SNOW-1984396: [Local Testing] Fix bug in DataFrame.except_ (#3167)

sfc-gh-jrose · sfc-gh-aling · web-flow · commit c21760e5f039 · 2025-03-19T16:58:29.000-07:00
Co-authored-by: Adam Ling <adam.ling@snowflake.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,10 @@
 
 #### New Features
 
+#### Bug Fixes
+
+- Fixed a bug in `DataFrame.except_` that would cause rows to be incorrectly dropped.
+
 ### Snowpark pandas API Updates
 
 #### New Features
diff --git a/src/snowflake/snowpark/mock/_plan.py b/src/snowflake/snowpark/mock/_plan.py
@@ -1060,14 +1060,15 @@ def execute_mock_plan(
                         )  # Rows that are all None/NaN in both sets
                     ]
                 elif operator == EXCEPT:
-                    res_df = res_df[
-                        ~(
-                            res_df.isin(cur_df.values.ravel()).all(axis=1)
-                        ).values  # NOT IS IN
-                        | (
-                            ~any_null_rows_in_cur_df & null_rows_in_res_df.values
-                        )  # Rows that are all None/NaN only in LEFT
-                    ]
+                    # Snowflake's EXCEPT also removes duplicate rows from the left side, so de-duplicate it first
+                    res_df = res_df.drop_duplicates()
+                    sf_types = res_df.sf_types
+
+                    # Appending the right side twice guarantees each of its rows is duplicated, so keep=False drops them all
+                    res_df = pd.concat([res_df, cur_df, cur_df]).drop_duplicates(
+                        keep=False
+                    )
+                    res_df.sf_types = sf_types
 
                 # Compute drop duplicates
                 res_df = res_df.drop_duplicates()
diff --git a/tests/integ/scala/test_dataframe_set_operations_suite.py b/tests/integ/scala/test_dataframe_set_operations_suite.py
@@ -132,6 +132,14 @@ def test_except_between_two_projects_without_references_used_in_filter(session):
     Utils.check_answer(df1.select("b").except_(df2.select("c")), Row(2))
 
 
+def test_dataframe_except_edge_cases(session):
+    # Tests that an all-None row is kept and that values spread across different right-side rows do not count as a match
+    df1 = session.create_dataframe([[None, None], [1, 2], [1, 2], [3, 4], [5, 6]])
+    df2 = session.create_dataframe([[1, 1], [2, 2], [5, 6]])
+
+    Utils.check_answer(df1.except_(df2), [Row(None, None), Row(1, 2), Row(3, 4)])
+
+
 def test_union_unionall_unionbyname_unionallbyname_in_one_case(session):
     df1 = session.create_dataframe([(1, 2, 3)]).to_df("a", "b", "c")
     df2 = session.create_dataframe([(3, 1, 2)]).to_df("c", "a", "b")