Skip to content

Commit c21760e

Browse files
SNOW-1984396: [Local Testing] Fix bug in Dataframe.except_ (#3167)
Co-authored-by: Adam Ling <adam.ling@snowflake.com>
1 parent db0b32c commit c21760e

File tree

3 files changed

+21
-8
lines changed

3 files changed

+21
-8
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313

1414
#### New Features
1515

16+
#### Bug Fixes
17+
18+
- Fixed a bug in `DataFrame.except_` that would cause rows to be incorrectly dropped.
19+
1620
### Snowpark pandas API Updates
1721

1822
#### New Features

src/snowflake/snowpark/mock/_plan.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,14 +1060,15 @@ def execute_mock_plan(
10601060
) # Rows that are all None/NaN in both sets
10611061
]
10621062
elif operator == EXCEPT:
1063-
res_df = res_df[
1064-
~(
1065-
res_df.isin(cur_df.values.ravel()).all(axis=1)
1066-
).values # NOT IS IN
1067-
| (
1068-
~any_null_rows_in_cur_df & null_rows_in_res_df.values
1069-
) # Rows that are all None/NaN only in LEFT
1070-
]
1063+
# A side effect of Snowflake's set difference (EXCEPT) is that duplicates are removed from the left side
1064+
res_df = res_df.drop_duplicates()
1065+
sf_types = res_df.sf_types
1066+
1067+
# Two copies of the right side ensure that all rows present there are dropped when keep=False
1068+
res_df = pd.concat([res_df, cur_df, cur_df]).drop_duplicates(
1069+
keep=False
1070+
)
1071+
res_df.sf_types = sf_types
10711072

10721073
# Compute drop duplicates
10731074
res_df = res_df.drop_duplicates()

tests/integ/scala/test_dataframe_set_operations_suite.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,14 @@ def test_except_between_two_projects_without_references_used_in_filter(session):
132132
Utils.check_answer(df1.select("b").except_(df2.select("c")), Row(2))
133133

134134

135+
def test_dataframe_except_edge_cases(session):
136+
# Tests that an all-None row is kept, and that values split between rows are respected
137+
df1 = session.create_dataframe([[None, None], [1, 2], [1, 2], [3, 4], [5, 6]])
138+
df2 = session.create_dataframe([[1, 1], [2, 2], [5, 6]])
139+
140+
Utils.check_answer(df1.except_(df2), [Row(None, None), Row(1, 2), Row(3, 4)])
141+
142+
135143
def test_union_unionall_unionbyname_unionallbyname_in_one_case(session):
136144
df1 = session.create_dataframe([(1, 2, 3)]).to_df("a", "b", "c")
137145
df2 = session.create_dataframe([(3, 1, 2)]).to_df("c", "a", "b")

0 commit comments

Comments
 (0)