11from __future__ import annotations
22
3- from typing import (
4- TYPE_CHECKING ,
5- ClassVar ,
6- )
3+ from typing import TYPE_CHECKING
74
85import numpy as np
96
10- from pandas ._libs import missing as libmissing
117from pandas .compat import HAS_PYARROW
128from pandas .util ._decorators import set_module
139
1410from pandas .core .dtypes .base import (
1511 ExtensionDtype ,
1612 register_extension_dtype ,
1713)
18- from pandas .core .dtypes .common import (
19- is_object_dtype ,
20- is_string_dtype ,
21- )
14+ from pandas .core .dtypes .dtypes import ArrowDtype
2215
23- from pandas .core .arrays import ExtensionArray
16+ from pandas .core .arrays . arrow . array import ArrowExtensionArray
2417
2518if TYPE_CHECKING :
2619 from pandas ._typing import (
2720 type_t ,
2821 Shape ,
2922 )
3023
24+ import re
25+
3126import pyarrow as pa
3227
3328
def string_to_pyarrow_type(string: str) -> pa.DataType:
    """
    Map a ``list[<value type>]`` string to a pyarrow large-list type.

    Parameters
    ----------
    string : str
        Type description of the form ``"list[int64]"``.

    Returns
    -------
    pa.DataType
        ``pa.large_list`` wrapping the parsed value type.

    Raises
    ------
    ValueError
        If ``string`` is not exactly of the form ``list[...]`` or the value
        type inside the brackets is not recognized.
    """
    # TODO: combine this with to_pyarrow_type in pandas.core.arrays.arrow ?
    error_message = f"Cannot map {string} to a pyarrow list type"

    # fullmatch rather than search: text before or after the ``list[...]``
    # part (e.g. "xlist[int64]y") must be rejected, not silently ignored.
    mtch = re.fullmatch(r"list\[(.*)\]", string)
    if mtch is None:
        raise ValueError(error_message)

    value_type = mtch.group(1)
    if value_type in ("string", "large_string"):
        # Both spellings map to large_string.
        return pa.large_list(pa.large_string())

    try:
        # Let pyarrow resolve its own primitive aliases ("int64", "double", ...)
        return pa.large_list(pa.type_for_alias(value_type))
    except ValueError as err:
        # TODO: parametrized and nested value types are not supported yet
        raise ValueError(error_message) from err
44+
45+
3446@register_extension_dtype
3547@set_module ("pandas" )
36- class ListDtype (ExtensionDtype ):
48+ class ListDtype (ArrowDtype ):
3749 """
3850 An ExtensionDtype suitable for storing homogeneous lists of data.
3951 """
4052
41- type = list
42- name : ClassVar [str ] = "list"
def __init__(self, value_dtype: pa.DataType) -> None:
    """
    Create a list dtype whose elements have the pyarrow type ``value_dtype``.
    """
    # The parent dtype is initialized with a large_list of the element type.
    list_type = pa.large_list(value_dtype)
    super().__init__(list_type)
55+
56+ @classmethod
def construct_from_string(cls, string: str):
    """
    Construct an instance of this dtype from a string like ``"list[int64]"``.

    Parameters
    ----------
    string : str
        Type description that ``string_to_pyarrow_type`` can parse.

    Returns
    -------
    An instance of ``cls`` whose value type is the one named in ``string``.

    Raises
    ------
    TypeError
        If ``string`` is not a ``str`` or cannot be mapped to a pyarrow
        list type.
    """
    if not isinstance(string, str):
        raise TypeError(
            f"'construct_from_string' expects a string, got {type(string)}"
        )

    try:
        pa_type = string_to_pyarrow_type(string)
    except ValueError as e:
        raise TypeError(
            f"Cannot construct a '{cls.__name__}' from '{string}'"
        ) from e

    # string_to_pyarrow_type returns the complete list type, whereas the
    # constructor takes only the element type and wraps it in a large_list
    # itself; passing pa_type directly would yield a list of lists.
    return cls(pa_type.value_type)
4371
4472 @property
45- def na_value (self ) -> libmissing .NAType :
46- return libmissing .NA
def name(self) -> str:  # type: ignore[override]
    """
    Return the string identifying this dtype, of the form
    ``list[<value type>]``.
    """
    value_type = self.pyarrow_dtype.value_type
    return f"list[{value_type!s}]"
4778
4879 @property
def kind(self) -> str:
    """
    Return the kind code reported for this dtype.
    """
    # TODO(wayd): our extension interface says this field should be the
    # NumPy type character, but no such thing exists for list.
    # This uses the Arrow C Data exchange code instead.
    arrow_format_code = "+L"
    return arrow_format_code
5485
5586 @classmethod
@@ -64,22 +95,34 @@ def construct_array_type(cls) -> type_t[ListArray]:
6495 return ListArray
6596
6697
67- class ListArray (ExtensionArray ):
68- dtype = ListDtype ()
98+ class ListArray (ArrowExtensionArray ):
6999 __array_priority__ = 1000
70100
def __init__(
    self, values: pa.Array | pa.ChunkedArray | list | ListArray, value_type=None
) -> None:
    """
    Build a ListArray backed by a pyarrow ChunkedArray.

    Parameters
    ----------
    values : pa.Array, pa.ChunkedArray, list or ListArray
        Data to store. Another ListArray shares its underlying storage and
        a ChunkedArray is stored as-is; anything else is converted to a
        large_list array.
    value_type : optional
        pyarrow type of the list elements. When omitted it is taken from
        ``values`` (from its type if it is already a pyarrow array,
        otherwise by letting pyarrow infer it).

    Raises
    ------
    NotImplementedError
        If pyarrow is not installed.
    """
    if not HAS_PYARROW:
        raise NotImplementedError("ListArray requires pyarrow to be installed")

    if isinstance(values, type(self)):
        # Reuse the storage of the existing ListArray.
        self._pa_array = values._pa_array
        return

    if value_type is None:
        if isinstance(values, (pa.Array, pa.ChunkedArray)):
            typed_source = values
        else:
            typed_source = pa.array(values)
        value_type = typed_source.type.value_type

    if isinstance(values, pa.ChunkedArray):
        self._pa_array = values
        return

    # To support NA, we need to create an Array first :-(
    list_type = pa.large_list(value_type)
    arr = pa.array(values, type=list_type, from_pandas=True)
    self._pa_array = pa.chunked_array(arr, type=list_type)
122+
123+ @property
def _dtype(self):
    """
    Return a ListDtype built from the value type of the stored pyarrow data.
    """
    value_type = self._pa_array.type.value_type
    return ListDtype(value_type)
83126
84127 @classmethod
85128 def _from_sequence (cls , scalars , * , dtype = None , copy : bool = False ):
@@ -100,10 +143,12 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
100143 scalars [i ] = None
101144
102145 values = pa .array (scalars , from_pandas = True )
103- if values .type == "null" :
104- # TODO(wayd): this is a hack to get the tests to pass, but the overall issue
105- # is that our extension types don't support parametrization but the pyarrow
106- values = pa .array (values , type = pa .list_ (pa .null ()))
146+
147+ if values .type == "null" and dtype is not None :
148+ # TODO: the sequencing here seems wrong; just making the tests pass for now
149+ # but this needs a comprehensive review
150+ pa_type = string_to_pyarrow_type (str (dtype ))
151+ values = pa .array (values , type = pa_type )
107152
108153 return cls (values )
109154
@@ -114,21 +159,13 @@ def __getitem__(self, item):
114159 pos = np .array (range (len (item )))
115160 mask = pos [item ]
116161 return type (self )(self ._pa_array .take (mask ))
117- elif isinstance (item , int ): # scalar case
162+ elif isinstance (item , int ):
118163 return self ._pa_array [item ]
164+ elif isinstance (item , list ):
165+ return type (self )(self ._pa_array .take (item ))
119166
120167 return type (self )(self ._pa_array [item ])
121168
122- def __len__ (self ) -> int :
123- return len (self ._pa_array )
124-
125- def isna (self ):
126- return np .array (self ._pa_array .is_null ())
127-
128- def take (self , indexer , allow_fill = False , fill_value = None ):
129- # TODO: what do we need to do with allow_fill and fill_value here?
130- return type (self )(self ._pa_array .take (indexer ))
131-
132169 @classmethod
133170 def _empty (cls , shape : Shape , dtype : ExtensionDtype ):
134171 """
@@ -149,32 +186,5 @@ def _empty(cls, shape: Shape, dtype: ExtensionDtype):
149186 length = shape [0 ]
150187 else :
151188 length = shape
152- return cls ._from_sequence ([None ] * length , dtype = pa .list_ (pa .null ()))
153189
154- def copy (self ):
155- mm = pa .default_cpu_memory_manager ()
156-
157- # TODO(wayd): ChunkedArray does not implement copy_to so this
158- # ends up creating an Array
159- copied = self ._pa_array .combine_chunks ().copy_to (mm .device )
160- return type (self )(copied )
161-
162- def astype (self , dtype , copy = True ):
163- if isinstance (dtype , type (self .dtype )) and dtype == self .dtype :
164- if copy :
165- return self .copy ()
166- return self
167- elif is_string_dtype (dtype ) and not is_object_dtype (dtype ):
168- # numpy has problems with astype(str) for nested elements
169- # and pyarrow cannot cast from list[string] to string
170- return np .array ([str (x ) for x in self ._pa_array ], dtype = dtype )
171-
172- if not copy :
173- raise TypeError (f"astype from ListArray to { dtype } requires a copy" )
174-
175- return np .array (self ._pa_array .to_pylist (), dtype = dtype , copy = copy )
176-
177- @classmethod
178- def _concat_same_type (cls , to_concat ):
179- data = [x ._pa_array for x in to_concat ]
180- return cls (data )
190+ return cls ._from_sequence ([None ] * length , dtype = dtype )
0 commit comments