Implement first-class List type

WillAyd · WillAyd · commit c55bc0a9b02c · 2024-12-30T15:54:23.000-05:00
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -61,6 +61,7 @@
     PeriodDtype,
     IntervalDtype,
     DatetimeTZDtype,
+    ListDtype,
     StringDtype,
     BooleanDtype,
     # missing
@@ -261,6 +262,7 @@
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
+    "ListDtype",
     "MultiIndex",
     "NaT",
     "NamedAgg",
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -54,6 +54,7 @@
     TimedeltaArray,
 )
 from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexes.api import safe_sort_index
 
@@ -824,6 +825,11 @@ def assert_extension_array_equal(
             [np.isnan(val) for val in right._ndarray[right_na]]  # type: ignore[attr-defined]
         ), "wrong missing value sentinels"
 
+    # TODO: not every array type may be convertible to NumPy; should catch here
+    if isinstance(left.dtype, ListDtype) and isinstance(right.dtype, ListDtype):
+        assert left._pa_array == right._pa_array
+        return
+
     left_valid = left[~left_na].to_numpy(dtype=object)
     right_valid = right[~right_na].to_numpy(dtype=object)
     if check_exact:
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -40,6 +40,7 @@
     UInt32Dtype,
     UInt64Dtype,
 )
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import array  # noqa: ICN001
 from pandas.core.flags import Flags
@@ -103,6 +104,7 @@
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
+    "ListDtype",
     "MultiIndex",
     "NaT",
     "NamedAgg",
diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    ClassVar,
+)
+
+import numpy as np
+
+from pandas._libs import missing as libmissing
+from pandas.compat import HAS_PYARROW
+from pandas.util._decorators import set_module
+
+from pandas.core.dtypes.base import (
+    ExtensionDtype,
+    register_extension_dtype,
+)
+from pandas.core.dtypes.common import (
+    is_object_dtype,
+    is_string_dtype,
+)
+
+from pandas.core.arrays import ExtensionArray
+
+if TYPE_CHECKING:
+    from pandas._typing import type_t
+
+if HAS_PYARROW:
+    # pyarrow is optional: an unconditional import here would break
+    # ``import pandas`` wherever pyarrow is not installed
+    import pyarrow as pa
+
+
+@register_extension_dtype
+@set_module("pandas")
+class ListDtype(ExtensionDtype):
+    """
+    An ExtensionDtype suitable for storing homogeneous lists of data.
+    """
+
+    type = list
+    name: ClassVar[str] = "list"
+
+    @property
+    def na_value(self) -> libmissing.NAType:
+        """Scalar used to represent a missing list."""
+        return libmissing.NA
+
+    @property
+    def kind(self) -> str:
+        # TODO: our extension interface says this field should be the
+        # NumPy type character, but no such thing exists for list
+        # this assumes a PyArrow large list
+        return "+L"
+
+    @classmethod
+    def construct_array_type(cls) -> type_t[ListArray]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ListArray
+
+
+class ListArray(ExtensionArray):
+    """
+    ExtensionArray of list values backed by a ``pyarrow.ChunkedArray``.
+
+    Parameters
+    ----------
+    values : pyarrow.Array, pyarrow.ChunkedArray, list or ListArray
+        Data to wrap. Input that is neither a ChunkedArray nor a ListArray is
+        converted with ``pyarrow.array(values, from_pandas=True)``.
+
+    Raises
+    ------
+    NotImplementedError
+        If pyarrow is not installed.
+    """
+
+    dtype = ListDtype()
+    __array_priority__ = 1000
+
+    def __init__(self, values: pa.Array | pa.ChunkedArray | list | ListArray) -> None:
+        if not HAS_PYARROW:
+            raise NotImplementedError("ListArray requires pyarrow to be installed")
+
+        if isinstance(values, type(self)):
+            self._pa_array = values._pa_array
+        elif isinstance(values, pa.ChunkedArray):
+            self._pa_array = values
+        else:
+            # To support NA, we need to create an Array first :-(
+            arr = pa.array(values, from_pandas=True)
+            self._pa_array = pa.chunked_array([arr])
+
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
+        """
+        Construct a ListArray from a sequence of scalars.
+
+        ``dtype`` and ``copy`` are accepted for interface compatibility but are
+        currently unused.
+        """
+        if isinstance(scalars, ListArray):
+            return cls(scalars)
+
+        values = pa.array(scalars, from_pandas=True)
+        if values.type == "null":
+            # TODO(wayd): this is a hack to get the tests to pass, but the overall
+            # issue is that our extension types don't support parametrization
+            # while (presumably) the pyarrow list types do
+            values = pa.array(values, type=pa.list_(pa.null()))
+
+        return cls(values)
+
+    def __getitem__(self, item):
+        """
+        Select by boolean mask, integer position or anything pyarrow can index.
+
+        An integer returns the pyarrow scalar at that position; every other
+        selection returns a new ListArray.
+        """
+        # PyArrow does not support NumPy's selection with an equal length
+        # mask, so let's convert those to integral positions if needed
+        if isinstance(item, np.ndarray) and item.dtype == bool:
+            return type(self)(self._pa_array.take(np.flatnonzero(item)))
+        elif isinstance(item, (int, np.integer)):  # scalar case
+            # NumPy integers (e.g. from iterating an indexer) are scalars too
+            return self._pa_array[int(item)]
+
+        return type(self)(self._pa_array[item])
+
+    def __len__(self) -> int:
+        return len(self._pa_array)
+
+    def isna(self):
+        """Return a boolean NumPy array that is True where the element is null."""
+        return np.array(self._pa_array.is_null())
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        """
+        Take elements by position.
+
+        Parameters
+        ----------
+        indexer : sequence of int
+            Positions to take.
+        allow_fill : bool, default False
+            If True, ``-1`` marks a missing value and any other negative
+            position raises. If False, negative positions count from the end.
+        fill_value : object, optional
+            Currently ignored; filled positions are always null.
+
+        Returns
+        -------
+        ListArray
+
+        Raises
+        ------
+        ValueError
+            If ``allow_fill`` is True and ``indexer`` has values below ``-1``.
+        """
+        # TODO: honor a fill_value other than NA
+        indices = np.asarray(indexer, dtype=np.intp)
+        if allow_fill:
+            if (indices < -1).any():
+                raise ValueError("'indexer' contains values less than allowed (-1)")
+            # pyarrow emits a null wherever the index itself is null
+            indices = pa.array(indices, mask=indices == -1)
+        else:
+            # pyarrow does not wrap negative positions the way NumPy does
+            indices = np.where(indices < 0, indices + len(self), indices)
+
+        return type(self)(self._pa_array.take(indices))
+
+    def copy(self):
+        """Return a new ListArray that shares the underlying Arrow data."""
+        # Arrow arrays are immutable, so there is no need to re-take every
+        # element just to obtain an independent copy
+        return type(self)(self._pa_array)
+
+    def astype(self, dtype, copy=True):
+        """
+        Cast to ``dtype``.
+
+        Casting to ListDtype returns ``self`` (or a copy when ``copy`` is True).
+        Every other dtype yields a new NumPy array; for non-string targets
+        ``copy=False`` raises TypeError.
+        """
+        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+            return self.copy() if copy else self
+        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # numpy has problems with astype(str) for nested elements
+            # and pyarrow cannot cast from list[string] to string
+            return np.array([str(x) for x in self._pa_array], dtype=dtype)
+
+        if not copy:
+            raise TypeError(f"astype from ListArray to {dtype} requires a copy")
+
+        return np.array(self._pa_array.to_pylist(), dtype=dtype, copy=copy)
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        """Concatenate several ListArrays into one."""
+        # Combine the underlying chunks: the constructor expects array-like
+        # data, not a list of ChunkedArrays
+        chunks = [chunk for arr in to_concat for chunk in arr._pa_array.chunks]
+        # an explicit type keeps this valid when there are no chunks at all
+        return cls(pa.chunked_array(chunks, type=to_concat[0]._pa_array.type))
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -576,7 +576,10 @@ def convert_dtypes(
     @final
     @cache_readonly
     def dtype(self) -> DtypeObj:
-        return self.values.dtype
+        try:
+            return self.values.dtype
+        except AttributeError:  # PyArrow fallback: no .dtype, use the Arrow .type
+            return self.values.type
 
     @final
     def astype(
@@ -2234,12 +2237,18 @@ def new_block(
     *,
     ndim: int,
     refs: BlockValuesRefs | None = None,
+    dtype: DtypeObj | None = None,
 ) -> Block:
     # caller is responsible for ensuring:
     # - values is NOT a NumpyExtensionArray
     # - check_ndim/ensure_block_shape already checked
     # - maybe_coerce_values already called/unnecessary
-    klass = get_block_type(values.dtype)
+    # ``dtype`` optionally overrides ``values.dtype`` when choosing the Block
+    # class; test against None because a non-structured NumPy dtype is falsy
+    if dtype is not None:
+        klass = get_block_type(dtype)
+    else:
+        klass = get_block_type(values.dtype)
     return klass(values, ndim=ndim, placement=placement, refs=refs)
 
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1976,14 +1976,21 @@ def from_blocks(
 
     @classmethod
     def from_array(
-        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
+        cls,
+        array: ArrayLike,
+        index: Index,
+        refs: BlockValuesRefs | None = None,
+        *,
+        # keyword-only and optional so callers using (array, index, refs)
+        # keep working; forwarded to new_block to choose the Block class
+        dtype: DtypeObj | None = None,
     ) -> SingleBlockManager:
         """
         Constructor for if we have an array that is not yet a Block.
         """
         array = maybe_coerce_values(array)
         bp = BlockPlacement(slice(0, len(index)))
-        block = new_block(array, placement=bp, ndim=1, refs=refs)
+        block = new_block(array, placement=bp, ndim=1, refs=refs, dtype=dtype)
         return cls(block, index)
 
     def to_2d_mgr(self, columns: Index) -> BlockManager:
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -505,7 +505,7 @@ def __init__(
                 data = data.copy()
         else:
             data = sanitize_array(data, index, dtype, copy)
-            data = SingleBlockManager.from_array(data, index, refs=refs)
+            data = SingleBlockManager.from_array(data, index, refs=refs, dtype=dtype)
 
         NDFrame.__init__(self, data)
         self.name = name
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -1103,7 +1103,11 @@ def format_array(
     List[str]
     """
     fmt_klass: type[_GenericArrayFormatter]
-    if lib.is_np_dtype(values.dtype, "M"):
+    if hasattr(values, "type") and values.type == "null":
+        fmt_klass = _NullFormatter
+    elif hasattr(values, "type") and str(values.type).startswith("list"):
+        fmt_klass = _ListFormatter
+    elif lib.is_np_dtype(values.dtype, "M"):
         fmt_klass = _Datetime64Formatter
         values = cast(DatetimeArray, values)
     elif isinstance(values.dtype, DatetimeTZDtype):
@@ -1467,6 +1471,27 @@ def _format_strings(self) -> list[str]:
         return fmt_values
 
 
+class _NullFormatter(_GenericArrayFormatter):
+    def _format_strings(self) -> list[str]:
+        # every element is rendered with plain ``str``
+        return list(map(str, self.values))
+
+
+class _ListFormatter(_GenericArrayFormatter):
+    """Render the elements of a pyarrow list-typed array as strings."""
+
+    def _format_strings(self) -> list[str]:
+        # TODO(wayd): This doesn't seem right - where should missing values
+        # be handled
+        fmt_values = []
+        for x in self.values:
+            pyval = x.as_py()
+            # Only null is missing: an empty list is real data ("[]"), and the
+            # declared list[str] return type requires stringifying the value
+            fmt_values.append("" if pyval is None else str(pyval))
+        return fmt_values
+
+
 class _Datetime64Formatter(_GenericArrayFormatter):
     values: DatetimeArray
 
diff --git a/pandas/tests/extension/list/__init__.py b/pandas/tests/extension/list/__init__.py
diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py