[SPARK-55296][PS][FOLLOW-UP] Fix CoW mode so that it does not break groupby

ueshin · HyukjinKwon · commit 29fc55980059 · 2026-02-20T13:26:37.000+09:00
### What changes were proposed in this pull request? This is a follow-up of #54375. It fixes CoW mode so that it no longer breaks `groupby`, by delaying the disconnection of the anchor until the Series is actually updated. ### Why are the changes needed? CoW mode support was added in #54375, but it disconnected the anchor too early, which broke `groupby`. For example, the pandas-on-Spark result below differs from the pandas result: ```py >>> import pandas as pd >>> import pyspark.pandas as ps >>> >>> pdf1 = pd.DataFrame({"C": [0.362, 0.227, 1.267, -0.562], "B": [1, 2, 3, 4]}) >>> pdf2 = pd.DataFrame({"A": [1, 1, 2, 2]}) >>> >>> psdf1 = ps.from_pandas(pdf1) >>> psdf2 = ps.from_pandas(pdf2) >>> >>> pdf1.groupby([pdf1.C, pdf2.A]).agg("sum").sort_index() B C A -0.562 2 4 0.227 1 2 0.362 1 1 1.267 2 3 >>> psdf1.groupby([psdf1.C, psdf2.A]).agg("sum").sort_index() C B C A -0.562 2 -0.562 4 0.227 1 0.227 2 0.362 1 0.362 1 1.267 2 1.267 3 ``` ### Does this PR introduce _any_ user-facing change? Yes, it will behave more like pandas 3. ### How was this patch tested? The existing tests should pass. ### Was this patch authored or co-authored using generative AI tooling? Codex (GPT-5.3-Codex) Closes #54392 from ueshin/issues/SPARK-55296/fix_groupby. Authored-by: Takuya Ueshin <ueshin@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py
@@ -587,6 +587,16 @@ def __setitem__(self, key: Any, value: Any) -> None:
         from pyspark.pandas.series import Series, first_series
 
         if self._is_series:
+            if LooseVersion(pd.__version__) >= "3.0.0":
+                # pandas 3 CoW: mutating a Series view should not mutate the parent DataFrame.
+                self._psdf_or_psser._update_anchor(
+                    DataFrame(
+                        self._psdf_or_psser._psdf._internal.select_column(
+                            self._psdf_or_psser._column_label
+                        )
+                    )
+                )
+
             if (
                 isinstance(key, Series)
                 and (isinstance(self, iLocIndexer) or not same_anchor(key, self._psdf_or_psser))
@@ -811,7 +821,11 @@ def __setitem__(self, key: Any, value: Any) -> None:
             internal = self._internal.with_new_columns(
                 new_data_spark_columns, column_labels=column_labels, data_fields=new_fields
             )
-            self._psdf_or_psser._update_internal_frame(internal, check_same_anchor=False)
+            self._psdf_or_psser._update_internal_frame(
+                internal,
+                check_same_anchor=False,
+                anchor_force_disconnect=LooseVersion(pd.__version__) >= "3.0.0",
+            )
 
 
 class LocIndexer(LocIndexerLike):
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
@@ -430,10 +430,7 @@ def __init__(  # type: ignore[no-untyped-def]
             assert not copy
             assert fastpath is no_default
 
-            if LooseVersion(pd.__version__) < "3.0.0":
-                self._anchor = data
-            else:
-                self._anchor = DataFrame(data)
+            self._anchor = data
             self._col_label = index
 
         elif isinstance(data, Series):