Skip to content

Commit 76e0338

Browse files
SNOW-2432963: Add support for duplicated in faster pandas (#3904)
1 parent 0353304 commit 76e0338

File tree

3 files changed

+50
-0
lines changed

3 files changed

+50
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@
121121
- `to_datetime`
122122
- `drop`
123123
- `invert`
124+
- `duplicated`
124125
- Reuse row count from the relaxed query compiler in `get_axis_len`.
125126

126127
#### Bug Fixes

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17034,6 +17034,26 @@ def duplicated(
1703417034
self,
1703517035
subset: Union[Hashable, Sequence[Hashable]] = None,
1703617036
keep: DropKeep = "first",
17037+
) -> "SnowflakeQueryCompiler":
17038+
"""
17039+
Wrapper around _duplicated_internal to be supported in faster pandas.
17040+
"""
17041+
relaxed_query_compiler = None
17042+
if self._relaxed_query_compiler is not None:
17043+
relaxed_query_compiler = self._relaxed_query_compiler._duplicated_internal(
17044+
subset=subset,
17045+
keep=keep,
17046+
)
17047+
qc = self._duplicated_internal(
17048+
subset=subset,
17049+
keep=keep,
17050+
)
17051+
return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)
17052+
17053+
def _duplicated_internal(
17054+
self,
17055+
subset: Union[Hashable, Sequence[Hashable]] = None,
17056+
keep: DropKeep = "first",
1703717057
) -> "SnowflakeQueryCompiler":
1703817058
"""
1703917059
Return boolean Series denoting duplicate rows.

tests/integ/modin/test_faster_pandas.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,35 @@ def test_drop(session):
223223
assert_frame_equal(snow_result, native_result)
224224

225225

226+
@sql_count_checker(query_count=3, join_count=1)
227+
def test_duplicated(session):
228+
# create tables
229+
table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
230+
session.create_dataframe(
231+
native_pd.DataFrame([[2, 12], [2, 12], [3, 13]], columns=["A", "B"])
232+
).write.save_as_table(table_name, table_type="temp")
233+
234+
# create snow dataframes
235+
df = pd.read_snowflake(table_name)
236+
snow_result = df.duplicated()
237+
238+
# verify that the input dataframe has a populated relaxed query compiler
239+
assert df._query_compiler._relaxed_query_compiler is not None
240+
assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
241+
# verify that the output dataframe also has a populated relaxed query compiler
242+
assert snow_result._query_compiler._relaxed_query_compiler is not None
243+
assert (
244+
snow_result._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
245+
)
246+
247+
# create pandas dataframes
248+
native_df = df.to_pandas()
249+
native_result = native_df.duplicated()
250+
251+
# compare results
252+
assert_series_equal(snow_result, native_result)
253+
254+
226255
@sql_count_checker(query_count=3)
227256
def test_invert(session):
228257
# create tables

0 commit comments

Comments
 (0)