Implement GetItem tests

WillAyd · WillAyd · commit 4a5da0ccc00a · 2025-01-03T20:01:43.000-05:00
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -428,7 +428,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
         """
         if isinstance(value, pa.Scalar):
             pa_scalar = value
-        elif isna(value):
+        elif not is_list_like(value) and isna(value):
             pa_scalar = pa.scalar(None, type=pa_type)
         else:
             # Workaround https://github.com/apache/arrow/issues/37291
@@ -1350,7 +1350,16 @@ def take(
                 # TODO(ARROW-9433): Treat negative indices as NULL
                 indices_array = pa.array(indices_array, mask=fill_mask)
                 result = self._pa_array.take(indices_array)
-                if isna(fill_value):
+                if is_list_like(fill_value):
+                    # TODO: this should be hit by ListArray. Ideally we do:
+                    # pc.replace_with_mask(result, fill_mask, pa.scalar(fill_value))
+                    # but pyarrow does not yet implement that for list types
+                    new_values = [
+                        fill_value if should_fill else x.as_py()
+                        for x, should_fill in zip(result, fill_mask)
+                    ]
+                    return type(self)(new_values)
+                elif isna(fill_value):
                     return type(self)(result)
                 # TODO: ArrowNotImplementedError: Function fill_null has no
                 # kernel matching input types (array[string], scalar[string])
diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
@@ -11,10 +11,15 @@
     ExtensionDtype,
     register_extension_dtype,
 )
-from pandas.core.dtypes.common import is_string_dtype
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer_dtype,
+    is_string_dtype,
+)
 from pandas.core.dtypes.dtypes import ArrowDtype
 
 from pandas.core.arrays.arrow.array import ArrowExtensionArray
+from pandas.core.arrays.base import ExtensionArray
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -146,6 +151,15 @@ def __init__(
         else:
             if value_type is None:
                 if isinstance(values, (pa.Array, pa.ChunkedArray)):
+                    parent_type = values.type
+                    if not isinstance(parent_type, (pa.ListType, pa.LargeListType)):
+                        # Ideally could cast here, but I don't think pyarrow implements
+                        # many list casts
+                        new_values = [
+                            [x.as_py()] if x.is_valid else None for x in values
+                        ]
+                        values = pa.array(new_values, type=pa.large_list(parent_type))
+
                     value_type = values.type.value_type
                 else:
                     value_type = pa.array(values).type.value_type
@@ -193,19 +207,89 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
 
         return cls(values)
 
+    @classmethod
+    def _box_pa(
+        cls, value, pa_type: pa.DataType | None = None
+    ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
+        """
+        Box a list element as a pyarrow Scalar, anything else as a pyarrow Array.
+
+        Parameters
+        ----------
+        value : any
+        pa_type : pa.DataType | None
+
+        Returns
+        -------
+        pa.Array or pa.ChunkedArray or pa.Scalar
+        """
+        scalar_types = (pa.ListScalar, pa.LargeListScalar, list)
+        if value is None or isinstance(value, scalar_types):
+            # A Python list (or None) is treated as a single element here
+            boxed = cls._box_pa_scalar(value, pa_type)
+        else:
+            boxed = cls._box_pa_array(value, pa_type)
+        return boxed
+
     def __getitem__(self, item):
         # PyArrow does not support NumPy's selection with an equal length
         # mask, so let's convert those to integral positions if needed
-        if isinstance(item, np.ndarray) and item.dtype == bool:
-            pos = np.array(range(len(item)))
-            mask = pos[item]
-            return type(self)(self._pa_array.take(mask))
+        if isinstance(item, (np.ndarray, ExtensionArray)):
+            if is_bool_dtype(item.dtype):
+                mask_len = len(item)
+                if mask_len != len(self):
+                    raise IndexError(
+                        f"Boolean index has wrong length: {mask_len} "
+                        f"instead of {len(self)}"
+                    )
+                pos = np.arange(len(item))  # integer dtype even when empty
+
+                if isinstance(item, ExtensionArray):
+                    mask = pos[item.fillna(False)]
+                else:
+                    mask = pos[item]
+                return type(self)(self._pa_array.take(mask))
+            elif is_integer_dtype(item.dtype):
+                if isinstance(item, ExtensionArray) and item.isna().any():
+                    msg = "Cannot index with an integer indexer containing NA values"
+                    raise ValueError(msg)
+
+                indexer = pa.array(item)
+                return type(self)(self._pa_array.take(indexer))
         elif isinstance(item, int):
-            return self._pa_array[item]
+            value = self._pa_array[item]
+            if value.is_valid:
+                return value.as_py()
+            else:
+                return self.dtype.na_value
         elif isinstance(item, list):
-            return type(self)(self._pa_array.take(item))
+            # pyarrow does not yet support take() with an empty list of indices
+            # https://github.com/apache/arrow/issues/39917
+            if item:
+                try:
+                    result = self._pa_array.take(item)
+                except pa.lib.ArrowInvalid as e:
+                    if "Could not convert <NA>" in str(e):
+                        msg = (
+                            "Cannot index with an integer indexer containing NA values"
+                        )
+                        raise ValueError(msg) from e
+                    raise
+            else:
+                result = pa.array([], type=self._pa_array.type)
+
+            return type(self)(result)
+
+        try:
+            result = type(self)(self._pa_array[item])
+        except TypeError as e:
+            msg = (
+                "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                "(`None`) and integer or boolean arrays are valid indices"
+            )
+            raise IndexError(msg) from e
 
-        return type(self)(self._pa_array[item])
+        return result
 
     def __setitem__(self, key, value) -> None:
         msg = "ListArray does not support item assignment via setitem"
@@ -241,7 +325,13 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
         return super().astype(dtype, copy)
 
     def __eq__(self, other):
-        if isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
+        if isinstance(other, list):
+            from pandas.arrays import BooleanArray
+
+            mask = np.array([False] * len(self))
+            values = np.array([x.as_py() == other for x in self._pa_array])
+            return BooleanArray(values, mask)
+        elif isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
             from pandas.arrays import BooleanArray
 
             # TODO: pyarrow.compute does not implement broadcasting equality
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -23,7 +23,6 @@
 import warnings
 
 import numpy as np
-import pyarrow as pa
 
 from pandas._config import config
 
@@ -150,6 +149,7 @@
 )
 from pandas.core.array_algos.replace import should_use_regex
 from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.base import PandasObject
 from pandas.core.construction import extract_array
 from pandas.core.flags import Flags
@@ -7013,11 +7013,20 @@ def fillna(
                         stacklevel=2,
                     )
 
+        holds_list_array = False
+        if isinstance(self, ABCSeries) and isinstance(self.dtype, ListDtype):
+            holds_list_array = True
+        elif isinstance(self, ABCDataFrame) and any(
+            isinstance(x, ListDtype) for x in self.dtypes
+        ):
+            holds_list_array = True
+
         if isinstance(value, (list, tuple)):
-            raise TypeError(
-                '"value" parameter must be a scalar or dict, but '
-                f'you passed a "{type(value).__name__}"'
-            )
+            if not holds_list_array:
+                raise TypeError(
+                    '"value" parameter must be a scalar or dict, but '
+                    f'you passed a "{type(value).__name__}"'
+                )
 
         # set the default here, so functions examining the signature
         # can detect if something was set (e.g. in groupby) (GH9221)
@@ -7037,8 +7046,9 @@ def fillna(
                 value = Series(value)
                 value = value.reindex(self.index)
                 value = value._values
-            elif isinstance(value, pa.ListScalar) or not is_list_like(value):
-                # TODO(wayd): maybe is_list_like should return false for ListScalar?
+            elif (
+                isinstance(value, list) and isinstance(self.dtype, ListDtype)
+            ) or not is_list_like(value):
                 pass
             else:
                 raise TypeError(
@@ -7102,7 +7112,7 @@ def fillna(
             else:
                 return result
 
-        elif isinstance(value, pa.ListScalar) or not is_list_like(value):
+        elif holds_list_array or not is_list_like(value):
             if axis == 1:
                 result = self.T.fillna(value=value, limit=limit).T
                 new_data = result._mgr
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
@@ -18,6 +18,7 @@
     NDArrayBacked2DTests,
 )
 from pandas.tests.extension.base.dtype import BaseDtypeTests
+from pandas.tests.extension.base.getitem import BaseGetitemTests
 from pandas.tests.extension.base.groupby import BaseGroupbyTests
 from pandas.tests.extension.base.index import BaseIndexTests
 from pandas.tests.extension.base.interface import BaseInterfaceTests
@@ -49,7 +50,7 @@ def dtype():
 def data():
     """Length-100 ListArray for semantics test."""
     # TODO: make better random data
-    data = [list("a"), list("ab"), list("abc")] * 33 + [None]
+    data = [list("a"), list("ab"), list("abc")] * 33 + [list("a")]
     return ListArray(data)
 
 
@@ -74,7 +75,7 @@ class TestListArray(
     BaseCastingTests,
     BaseConstructorsTests,
     BaseDtypeTests,
-    # BaseGetitemTests,
+    BaseGetitemTests,
     BaseGroupbyTests,
     BaseIndexTests,
     BaseInterfaceTests,
@@ -90,12 +91,12 @@ class TestListArray(
     BaseSetitemTests,
     Dim2CompatTests,
 ):
-    # TODO(wayd): The tests here are copied from test_arrow.py
-    # It appears the TestArrowArray class has different expectations around
-    # when copies should be made then the base.ExtensionTests
-    # Assuming intentional, maybe in the long term this should just
-    # inherit from TestArrowArray
     def test_fillna_no_op_returns_copy(self, data):
+        # TODO(wayd): This test is copied from test_arrow.py
+        # It appears the TestArrowArray class has different expectations around
+        # when copies should be made than the base.ExtensionTests
+        # Assuming intentional, maybe in the long term this should just
+        # inherit from TestArrowArray
         data = data[~data.isna()]
 
         valid = data[0]
@@ -154,10 +155,7 @@ def test_compare_scalar(self, data, comparison_op):
         super().test_compare_scalar(data, comparison_op)
 
     def test_compare_array(self, data, comparison_op):
-        if comparison_op in (operator.eq, operator.ne):
-            pytest.skip("Series.combine does not properly handle missing values")
-
-        super().test_compare_array(data, comparison_op)
+        pytest.skip("ListArray comparison ops are not implemented")
 
     def test_invert(self, data):
         pytest.skip("ListArray does not implement invert")
@@ -229,6 +227,9 @@ def test_unstack(self, data, index, obj):
             # result = result.astype(object)
             tm.assert_frame_equal(result, expected)
 
+    def test_getitem_ellipsis_and_slice(self, data):
+        pytest.skip("ListArray does not support NumPy style ellipsis slicing nor 2-D")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840