Skip to content

Commit 82e510d

Browse files
SNOW-2396665: Add support for isin in faster pandas (#3856)
1 parent 3514dd9 commit 82e510d

File tree

3 files changed

+84
-0
lines changed

3 files changed

+84
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@
158158
- Improved performance of `DataFrame.to_snowflake` and `pd.to_snowflake(dataframe)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.
159159
- Improved performance of `Series.to_snowflake` and `pd.to_snowflake(series)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.
160160
- Set `cte_optimization_enabled` to True for all Snowpark pandas sessions.
161+
- Add support for `isin` in faster pandas.
161162

162163
## 1.39.1 (2025-09-25)
163164

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13009,6 +13009,31 @@ def isin(
1300913009
values: Union[
1301013010
list[Any], np.ndarray, "SnowflakeQueryCompiler", dict[Hashable, ListLike]
1301113011
],
13012+
) -> "SnowflakeQueryCompiler":
13013+
"""
13014+
Wrapper around _isin_internal to be supported in faster pandas.
13015+
"""
13016+
relaxed_query_compiler = None
13017+
if self._relaxed_query_compiler is not None and (
13018+
not isinstance(values, SnowflakeQueryCompiler)
13019+
or values._relaxed_query_compiler is not None
13020+
):
13021+
new_values = values
13022+
if isinstance(values, SnowflakeQueryCompiler):
13023+
assert values._relaxed_query_compiler is not None
13024+
new_values = values._relaxed_query_compiler
13025+
relaxed_query_compiler = self._relaxed_query_compiler._isin_internal(
13026+
values=new_values
13027+
)
13028+
13029+
qc = self._isin_internal(values=values)
13030+
return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)
13031+
13032+
def _isin_internal(
13033+
self,
13034+
values: Union[
13035+
list[Any], np.ndarray, "SnowflakeQueryCompiler", dict[Hashable, ListLike]
13036+
],
1301213037
) -> "SnowflakeQueryCompiler": # noqa: PR02
1301313038
"""
1301413039
Check for each element of `self` whether it's contained in passed `values`.

tests/integ/modin/test_faster_pandas.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,64 @@ def test_isna_notna(session, func):
220220
assert_frame_equal(snow_result, native_result, check_dtype=False)
221221

222222

223+
@sql_count_checker(query_count=3)
def test_isin_list(session):
    """Filter a frame with ``isin`` on a list of values.

    Checks that both the input frame and the filtered result carry a relaxed
    query compiler with ``_dummy_row_pos_mode`` enabled, and that the filtered
    rows match what native pandas produces for the same filter.
    """
    # Stage a small temporary table to read back through Snowpark pandas.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source_rows = native_pd.DataFrame(
        [[1, 11], [2, 12], [3, 13]], columns=["A", "B"]
    )
    session.create_dataframe(source_rows).write.save_as_table(
        table_name, table_type="temp"
    )

    # Build the Snowpark pandas frame and apply the isin filter.
    df = pd.read_snowflake(table_name)
    wanted = [12, 13]
    snow_result = df[df["B"].isin(wanted)]

    # The input frame first, then the output frame: each must expose a
    # populated relaxed query compiler running in dummy row position mode.
    for frame in (df, snow_result):
        relaxed_qc = frame._query_compiler._relaxed_query_compiler
        assert relaxed_qc is not None
        assert relaxed_qc._dummy_row_pos_mode is True

    # Compute the expected result with native pandas.
    native_df = df.to_pandas()
    native_result = native_df[native_df["B"].isin(wanted)]

    assert_frame_equal(snow_result, native_result, check_dtype=False)
250+
251+
252+
@sql_count_checker(query_count=3)
def test_isin_series(session):
    """Filter a frame with ``isin`` where the values are another column.

    Checks that both the input frame and the filtered result carry a relaxed
    query compiler with ``_dummy_row_pos_mode`` enabled, and that the filtered
    rows match what native pandas produces for the same filter.
    """
    # Stage a small temporary table to read back through Snowpark pandas.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source_rows = native_pd.DataFrame(
        [[1, 11], [2, 12], [3, 13]], columns=["A", "B"]
    )
    session.create_dataframe(source_rows).write.save_as_table(
        table_name, table_type="temp"
    )

    # Build the Snowpark pandas frame; the isin values come from column "A"
    # of the same frame.
    df = pd.read_snowflake(table_name)
    membership_mask = df["B"].isin(df["A"])
    snow_result = df[membership_mask]

    # The input frame first, then the output frame: each must expose a
    # populated relaxed query compiler running in dummy row position mode.
    for frame in (df, snow_result):
        relaxed_qc = frame._query_compiler._relaxed_query_compiler
        assert relaxed_qc is not None
        assert relaxed_qc._dummy_row_pos_mode is True

    # Compute the expected result with native pandas.
    native_df = df.to_pandas()
    native_mask = native_df["B"].isin(native_df["A"])
    native_result = native_df[native_mask]

    assert_frame_equal(snow_result, native_result, check_dtype=False)
279+
280+
223281
@sql_count_checker(query_count=0)
224282
def test_dummy_row_pos_optimization_enabled_on_session(db_parameters):
225283
with Session.builder.configs(db_parameters).create() as new_session:

0 commit comments

Comments
 (0)