diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f19119bf6..49f2eed684 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -158,6 +158,7 @@
   - `groupby.all`
   - `groupby.unique`
   - `to_snowflake`
+  - `to_snowpark`
 - Make faster pandas disabled by default (opt-in instead of opt-out).
 - Improve performance of `drop_duplicates` by avoiding joins when `keep!=False` in faster pandas.
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index c1fc859ea5..9a8a4d73a8 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -2332,6 +2332,22 @@ def _to_snowflake_internal(
     def to_snowpark(
         self, index: bool = True, index_label: Optional[IndexLabel] = None
+    ) -> SnowparkDataFrame:
+        """
+        Wrapper around _to_snowpark_internal to be supported in faster pandas.
+        """
+        if self._relaxed_query_compiler is not None and not index:
+            return self._relaxed_query_compiler._to_snowpark_internal(
+                index=index,
+                index_label=index_label,
+            )
+        return self._to_snowpark_internal(
+            index=index,
+            index_label=index_label,
+        )
+
+    def _to_snowpark_internal(
+        self, index: bool = True, index_label: Optional[IndexLabel] = None
     ) -> SnowparkDataFrame:
         """
         Convert the Snowpark pandas Dataframe to Snowpark Dataframe.
The Snowpark Dataframe is created by selecting diff --git a/tests/integ/modin/test_faster_pandas.py b/tests/integ/modin/test_faster_pandas.py index 4eb4ea156b..ce349fc40c 100644 --- a/tests/integ/modin/test_faster_pandas.py +++ b/tests/integ/modin/test_faster_pandas.py @@ -1786,6 +1786,40 @@ def test_to_snowflake(session): assert_frame_equal(snow_result, native_result) +@sql_count_checker(query_count=3) +def test_to_snowpark(session): + with session_parameter_override( + session, "dummy_row_pos_optimization_enabled", True + ): + # create tables + table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE) + session.create_dataframe( + native_pd.DataFrame([[2, 12], [1, 11], [3, 13]], columns=["A", "B"]) + ).write.save_as_table(table_name, table_type="temp") + + # create snow dataframes + df = pd.read_snowflake(table_name) + sdf = df.to_snowpark(index=False) + snow_result = sdf.to_snowpark_pandas().sort_values(by="A") + + # verify that the input dataframe has a populated relaxed query compiler + assert df._query_compiler._relaxed_query_compiler is not None + assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True + # verify that the output dataframe also has a populated relaxed query compiler + assert snow_result._query_compiler._relaxed_query_compiler is not None + assert ( + snow_result._query_compiler._relaxed_query_compiler._dummy_row_pos_mode + is True + ) + + # create pandas dataframes + native_df = df.to_pandas() + native_result = native_df.sort_values(by="A") + + # compare results + assert_frame_equal(snow_result, native_result) + + @sql_count_checker(query_count=0) def test_dummy_row_pos_optimization_enabled_on_session(db_parameters): with Session.builder.configs(db_parameters).create() as new_session: