fix: use PyCapsule Interface instead of Dataframe Interchange Protocol

MarcoGorelli · MarcoGorelli · commit aa6132d14e4f · 2024-11-09T17:32:31.000Z
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,7 @@ dev = [
     "mypy",
     "pandas-stubs",
     "pre-commit",
+    "pyarrow",
     "flit",
 ]
 docs = [
diff --git a/seaborn/_core/data.py b/seaborn/_core/data.py
@@ -269,9 +269,9 @@ def _assign_variables(
 
 def handle_data_source(data: object) -> pd.DataFrame | Mapping | None:
     """Convert the data source object to a common union representation."""
-    if isinstance(data, pd.DataFrame) or hasattr(data, "__dataframe__"):
+    if isinstance(data, pd.DataFrame) or hasattr(data, "__arrow_c_stream__"):
         # Check for pd.DataFrame inheritance could be removed once
-        # minimal pandas version supports dataframe interchange (1.5.0).
+        # minimal pandas version supports PyCapsule Interface (2.2).
         data = convert_dataframe_to_pandas(data)
     elif data is not None and not isinstance(data, Mapping):
         err = f"Data source must be a DataFrame or Mapping, not {type(data)!r}."
@@ -285,35 +285,29 @@ def convert_dataframe_to_pandas(data: object) -> pd.DataFrame:
     if isinstance(data, pd.DataFrame):
         return data
 
-    if not hasattr(pd.api, "interchange"):
-        msg = (
-            "Support for non-pandas DataFrame objects requires a version of pandas "
-            "that implements the DataFrame interchange protocol. Please upgrade "
-            "your pandas version or coerce your data to pandas before passing "
-            "it to seaborn."
-        )
-        raise TypeError(msg)
-
-    if _version_predates(pd, "2.0.2"):
-        msg = (
-            "DataFrame interchange with pandas<2.0.2 has some known issues. "
-            f"You are using pandas {pd.__version__}. "
-            "Continuing, but it is recommended to carefully inspect the results and to "
-            "consider upgrading."
-        )
-        warnings.warn(msg, stacklevel=2)
-
-    try:
-        # This is going to convert all columns in the input dataframe, even though
-        # we may only need one or two of them. It would be more efficient to select
-        # the columns that are going to be used in the plot prior to interchange.
-        # Solving that in general is a hard problem, especially with the objects
-        # interface where variables passed in Plot() may only be referenced later
-        # in Plot.add(). But noting here in case this seems to be a bottleneck.
-        return pd.api.interchange.from_dataframe(data)
-    except Exception as err:
-        msg = (
-            "Encountered an exception when converting data source "
-            "to a pandas DataFrame. See traceback above for details."
-        )
-        raise RuntimeError(msg) from err
+    if hasattr(data, '__arrow_c_stream__'):
+        try:
+            import pyarrow
+        except ImportError as err:
+            msg = "PyArrow is required for non-pandas Dataframe support."
+            raise RuntimeError(msg) from err
+        if _version_predates(pyarrow, '14.0.0'):
+            msg = "PyArrow>=14.0.0 is required for non-pandas Dataframe support."
+            raise RuntimeError(msg)
+        try:
+            # This is going to convert all columns in the input dataframe, even though
+            # we may only need one or two of them. It would be more efficient to select
+            # the columns that are going to be used in the plot prior to interchange.
+            # Solving that in general is a hard problem, especially with the objects
+            # interface where variables passed in Plot() may only be referenced later
+            # in Plot.add(). But noting here in case this seems to be a bottleneck.
+            return pyarrow.table(data).to_pandas()
+        except Exception as err:
+            msg = (
+                "Encountered an exception when converting data source "
+                "to a pandas DataFrame. See traceback above for details."
+            )
+            raise RuntimeError(msg) from err
+
+    msg = f"Expected object which implements '__arrow_c_stream__' from the PyCapsule Interface, got: {type(data)}"
+    raise TypeError(msg)
diff --git a/tests/_core/test_data.py b/tests/_core/test_data.py
@@ -425,7 +425,7 @@ def test_data_interchange(self, mock_long_df, long_df):
     )
     def test_data_interchange_failure(self, mock_long_df):
 
-        mock_long_df._data = None  # Break __dataframe__()
+        mock_long_df.__arrow_c_stream__ = lambda x: 1/0  # Break __arrow_c_stream__()
         with pytest.raises(RuntimeError, match="Encountered an exception"):
             PlotData(mock_long_df, {"x": "x"})
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -188,8 +188,8 @@ class MockInterchangeableDataFrame:
     def __init__(self, data):
         self._data = data
 
-    def __dataframe__(self, *args, **kwargs):
-        return self._data.__dataframe__(*args, **kwargs)
+    def __arrow_c_stream__(self, *args, **kwargs):
+        return self._data.__arrow_c_stream__()
 
 
 @pytest.fixture

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,7 @@ dev = [`
`42`	`42`	`"mypy",`
`43`	`43`	`"pandas-stubs",`
`44`	`44`	`"pre-commit",`
	`45`	`+ "pyarrow",`
`45`	`46`	`"flit",`
`46`	`47`	`]`
`47`	`48`	`docs = [`
Original file line number	Diff line number	Diff line change
`@@ -425,7 +425,7 @@ def test_data_interchange(self, mock_long_df, long_df):`
`425`	`425`	`)`
`426`	`426`	`def test_data_interchange_failure(self, mock_long_df):`
`427`	`427`
`428`		`- mock_long_df._data = None # Break __dataframe__()`
	`428`	`+ mock_long_df.__arrow_c_stream__ = lambda x: 1/0 # Break __arrow_c_stream__()`
`429`	`429`	`with pytest.raises(RuntimeError, match="Encountered an exception"):`
`430`	`430`	`PlotData(mock_long_df, {"x": "x"})`
`431`	`431`