Skip to content

Commit 3c0a4ca

Browse files
authored
Fix compatibility for pandas 1.4 (#2650)
1 parent f2c9c4c commit 3c0a4ca

File tree

29 files changed: 195 additions and 100 deletions.

docs/source/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@
193193

194194
locale_dirs = ['locale/'] # path is example but recommended.
195195
gettext_compact = False # optional.
196+
ipython_warning_is_error = False
196197

197198

198199
import sphinx

mars/dataframe/arrays.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@
5656
from ..core import is_kernel_mode
5757
from ..utils import pd_release_version, tokenize
5858

59-
_use_bool_any_all = pd_release_version >= (1, 3, 0)
59+
_use_bool_any_all = pd_release_version[:2] >= (1, 3)
60+
_use_extension_index = pd_release_version[:2] >= (1, 4)
6061

6162

6263
class ArrowDtype(ExtensionDtype):
@@ -497,8 +498,14 @@ def astype(self, dtype, copy=True):
497498
# try to slice 1 record to get the result dtype
498499
test_array = self._arrow_array.slice(0, 1).to_pandas()
499500
test_result_array = test_array.astype(dtype).array
501+
if _use_extension_index:
502+
test_result_type = type(test_array.astype(dtype).values)
503+
if test_result_type is np.ndarray:
504+
test_result_type = np.array
505+
else:
506+
test_result_type = type(test_result_array)
500507

501-
result_array = type(test_result_array)(
508+
result_array = test_result_type(
502509
np.full(
503510
self.shape,
504511
test_result_array.dtype.na_value,

mars/dataframe/base/apply.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
FunctionField,
3131
)
3232
from ...utils import enter_current_session, quiet_stdio
33+
from ..arrays import ArrowArray
3334
from ..operands import DataFrameOperandMixin, DataFrameOperand
3435
from ..utils import (
3536
build_df,
@@ -138,9 +139,22 @@ def execute(cls, ctx, op):
138139
**op.kwds,
139140
)
140141
else:
141-
result = input_data.apply(
142-
op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds
143-
)
142+
try:
143+
result = input_data.apply(
144+
op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds
145+
)
146+
except TypeError:
147+
if isinstance(input_data.values, ArrowArray):
148+
input_data = pd.Series(
149+
input_data.to_numpy(),
150+
name=input_data.name,
151+
index=input_data.index,
152+
)
153+
result = input_data.apply(
154+
op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds
155+
)
156+
else: # pragma: no cover
157+
raise
144158
ctx[out.key] = result
145159

146160
@classmethod

mars/dataframe/base/memory_usage.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def _tile_dataframe(cls, op: "DataFrameMemoryUsage"):
171171

172172
# produce map chunks
173173
# allocate matrix of chunks
174-
chunks_to_reduce = np.empty(shape=df.chunk_shape, dtype=np.object)
174+
chunks_to_reduce = np.empty(shape=df.chunk_shape, dtype=object)
175175
for c in df.chunks:
176176
new_op = op.copy().reset_key()
177177
new_op.stage = OperandStage.map
@@ -205,7 +205,7 @@ def _tile_dataframe(cls, op: "DataFrameMemoryUsage"):
205205
ceildiv(chunks_to_reduce.shape[0], combine_size),
206206
chunks_to_reduce.shape[1],
207207
),
208-
dtype=np.object,
208+
dtype=object,
209209
)
210210
for idx in range(0, chunks_to_reduce.shape[0], combine_size):
211211
for idx2 in range(chunks_to_reduce.shape[1]):

mars/dataframe/base/tests/test_base_execution.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -565,21 +565,23 @@ def rename_fn(f, new_name):
565565

566566
@pytest.mark.skipif(pa is None, reason="pyarrow not installed")
567567
def test_transform_with_arrow_dtype_execution(setup):
568-
df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]})
569-
df = from_pandas_df(df1)
568+
raw = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]})
569+
df = from_pandas_df(raw)
570570
df["b"] = df["b"].astype("Arrow[string]")
571571

572572
r = df.transform({"b": lambda x: x + "_suffix"})
573573
result = r.execute().fetch()
574-
expected = df1.transform({"b": lambda x: x + "_suffix"})
574+
result["b"] = result["b"].to_numpy()
575+
expected = raw.transform({"b": lambda x: x + "_suffix"})
575576
pd.testing.assert_frame_equal(result, expected)
576577

577-
s1 = df1["b"]
578+
s1 = raw["b"]
578579
s = from_pandas_series(s1)
579580
s = s.astype("arrow_string")
580581

581582
r = s.transform(lambda x: x + "_suffix")
582583
result = r.execute().fetch()
584+
result = pd.Series(result.to_numpy(), name=result.name, index=result.index)
583585
expected = s1.transform(lambda x: x + "_suffix")
584586
pd.testing.assert_series_equal(result, expected)
585587

mars/dataframe/base/to_numeric.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __call__(self, arg):
5555
self.output_types = [OutputType.tensor]
5656
dtype = tensor.dtype
5757
if dtype.kind == "U":
58-
dtype = np.dtype(np.object_)
58+
dtype = np.dtype(object)
5959
return self.new_tileables([tensor], shape=tensor.shape, dtype=dtype)[0]
6060

6161
@classmethod

mars/dataframe/datasource/index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def execute(cls, ctx, op):
240240
else:
241241
out = op.outputs[0]
242242
inp = ctx[op.inputs[0].key]
243-
dtype = out.dtype if out.dtype != np.object else None
243+
dtype = out.dtype if out.dtype != object else None
244244
if hasattr(inp, "index"):
245245
# DataFrame, Series
246246
ctx[out.key] = pd.Index(inp.index, dtype=dtype, name=out.name)

mars/dataframe/groupby/head.py

Lines changed: 47 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -17,29 +17,22 @@
1717

1818
from ... import opcodes
1919
from ...core import OutputType, get_output_types, recursive_tile
20-
from ...serialization.serializables import DictField, Int64Field
20+
from ...serialization.serializables import DictField, Int64Field, BoolField
21+
from ...utils import pd_release_version
2122
from ..core import IndexValue
2223
from ..operands import DataFrameOperandMixin, DataFrameOperand
2324
from ..utils import build_concatenated_rows_frame, parse_index
2425

26+
_pandas_enable_negative = pd_release_version >= (1, 4, 0)
27+
2528

2629
class GroupByHead(DataFrameOperand, DataFrameOperandMixin):
2730
_op_type_ = opcodes.GROUPBY_HEAD
2831
_op_module_ = "dataframe.groupby"
2932

30-
_row_count = Int64Field("row_count")
31-
_groupby_params = DictField("groupby_params")
32-
33-
def __init__(self, row_count=None, groupby_params=None, **kw):
34-
super().__init__(_row_count=row_count, _groupby_params=groupby_params, **kw)
35-
36-
@property
37-
def row_count(self) -> int:
38-
return self._row_count
39-
40-
@property
41-
def groupby_params(self) -> dict:
42-
return self._groupby_params
33+
row_count = Int64Field("row_count")
34+
groupby_params = DictField("groupby_params")
35+
enable_negative = BoolField("enable_negative")
4336

4437
def __call__(self, groupby):
4538
df = groupby
@@ -72,30 +65,32 @@ def tile(cls, op: "GroupByHead"):
7265
groupby_params = op.groupby_params.copy()
7366
selection = groupby_params.pop("selection", None)
7467

68+
enable_negative = _pandas_enable_negative and op.enable_negative
69+
7570
if len(in_df.shape) > 1:
7671
in_df = build_concatenated_rows_frame(in_df)
7772
out_df = op.outputs[0]
7873

79-
# when row_count is not positive or there is only one chunk,
80-
# tile with a single chunk
81-
if op.row_count <= 0 or len(in_df.chunks) == 0:
74+
# when row_count is not positive and pandas does not support negative head,
75+
# or there is only one chunk, tile with a single chunk
76+
if (not enable_negative and op.row_count <= 0) or len(in_df.chunks) <= 1:
77+
row_num = 0 if not enable_negative and op.row_count <= 0 else np.nan
78+
new_shape = (row_num,)
79+
new_nsplits = ((row_num,),)
80+
if out_df.ndim > 1:
81+
new_shape += (out_df.shape[1],)
82+
new_nsplits += ((out_df.shape[1],),)
83+
8284
c = in_df.chunks[0]
8385
chunk_op = op.copy().reset_key()
84-
params = c.params
85-
row_num = 0 if op.row_count <= 0 else np.nan
86-
params["shape"] = (row_num,) + c.shape[1:]
87-
params["index_value"] = out_df.index_value
86+
params = out_df.params
87+
params["shape"] = new_shape
88+
params["index"] = (0,) * out_df.ndim
8889
out_chunk = chunk_op.new_chunk([c], **params)
8990

9091
tileable_op = op.copy().reset_key()
91-
params = out_df.params
92-
params["shape"] = (row_num,) + c.shape[1:]
93-
params["index_value"] = out_df.index_value
9492
return tileable_op.new_tileables(
95-
[in_df],
96-
nsplits=((row_num,),) + in_df.nsplits[1:],
97-
chunks=[out_chunk],
98-
**params
93+
[in_df], nsplits=new_nsplits, chunks=[out_chunk], **params
9994
)
10095

10196
if in_df.ndim > 1 and selection:
@@ -116,15 +111,19 @@ def tile(cls, op: "GroupByHead"):
116111
in_df = yield from recursive_tile(in_df[pre_selection])
117112

118113
# generate pre chunks
119-
pre_chunks = []
120-
for c in in_df.chunks:
121-
pre_op = op.copy().reset_key()
122-
pre_op._output_types = get_output_types(c)
123-
pre_op._groupby_params = op.groupby_params.copy()
124-
pre_op._groupby_params.pop("selection", None)
125-
params = c.params
126-
params["shape"] = (np.nan,) + c.shape[1:]
127-
pre_chunks.append(pre_op.new_chunk([c], **params))
114+
if op.row_count < 0:
115+
# when we have negative row counts, pre-groupby optimization is not possible
116+
pre_chunks = in_df.chunks
117+
else:
118+
pre_chunks = []
119+
for c in in_df.chunks:
120+
pre_op = op.copy().reset_key()
121+
pre_op._output_types = get_output_types(c)
122+
pre_op.groupby_params = op.groupby_params.copy()
123+
pre_op.groupby_params.pop("selection", None)
124+
params = c.params
125+
params["shape"] = (np.nan,) + c.shape[1:]
126+
pre_chunks.append(pre_op.new_chunk([c], **params))
128127

129128
new_op = op.copy().reset_key()
130129
new_op._output_types = get_output_types(in_df)
@@ -142,8 +141,8 @@ def tile(cls, op: "GroupByHead"):
142141
post_chunks = []
143142
for c in grouped.chunks:
144143
post_op = op.copy().reset_key()
145-
post_op._groupby_params = op.groupby_params.copy()
146-
post_op._groupby_params.pop("selection", None)
144+
post_op.groupby_params = op.groupby_params.copy()
145+
post_op.groupby_params.pop("selection", None)
147146
if op.output_types[0] == OutputType.dataframe:
148147
index = c.index
149148
else:
@@ -175,7 +174,10 @@ def execute(cls, ctx, op: "GroupByHead"):
175174
if selection:
176175
grouped = grouped[selection]
177176

178-
ctx[op.outputs[0].key] = grouped.head(op.row_count)
177+
result = grouped.head(op.row_count)
178+
if not op.enable_negative and op.row_count < 0:
179+
result = result.iloc[:0]
180+
ctx[op.outputs[0].key] = result
179181

180182

181183
def head(groupby, n=5):
@@ -215,5 +217,9 @@ def head(groupby, n=5):
215217
groupby_params = groupby.op.groupby_params.copy()
216218
groupby_params.pop("as_index", None)
217219

218-
op = GroupByHead(row_count=n, groupby_params=groupby_params)
220+
op = GroupByHead(
221+
row_count=n,
222+
groupby_params=groupby_params,
223+
enable_negative=_pandas_enable_negative,
224+
)
219225
return op(groupby)

mars/dataframe/groupby/tests/test_groupby_execution.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,11 @@ def test_groupby_head(setup):
869869
r.execute().fetch().sort_index(), df1.groupby("b").head(1)
870870
)
871871

872+
r = mdf.groupby("b").head(-1)
873+
pd.testing.assert_frame_equal(
874+
r.execute().fetch().sort_index(), df1.groupby("b").head(-1)
875+
)
876+
872877
# test head with selection
873878
r = mdf.groupby("b")["a", "d"].head(1)
874879
pd.testing.assert_frame_equal(
@@ -1036,6 +1041,7 @@ def test_groupby_agg_with_arrow_dtype(setup):
10361041

10371042
r = mdf.groupby("b").count()
10381043
result = r.execute().fetch()
1044+
result.index = result.index.astype(object)
10391045
expected = df1.groupby("b").count()
10401046
pd.testing.assert_frame_equal(result, expected)
10411047

@@ -1044,6 +1050,7 @@ def test_groupby_agg_with_arrow_dtype(setup):
10441050

10451051
r = mseries.groupby(mseries).count()
10461052
result = r.execute().fetch()
1053+
result.index = result.index.astype(object)
10471054
expected = series1.groupby(series1).count()
10481055
pd.testing.assert_series_equal(result, expected)
10491056

@@ -1053,6 +1060,7 @@ def test_groupby_agg_with_arrow_dtype(setup):
10531060

10541061
r = mseries.groupby(mseries).count()
10551062
result = r.execute().fetch()
1063+
result.index = result.index.astype(object)
10561064
expected = series2.groupby(series2).count()
10571065
pd.testing.assert_series_equal(result, expected)
10581066

@@ -1065,6 +1073,7 @@ def test_groupby_apply_with_arrow_dtype(setup):
10651073

10661074
applied = mdf.groupby("b").apply(lambda df: df.a.sum())
10671075
result = applied.execute().fetch()
1076+
result.index = result.index.astype(object)
10681077
expected = df1.groupby("b").apply(lambda df: df.a.sum())
10691078
pd.testing.assert_series_equal(result, expected)
10701079

@@ -1073,5 +1082,6 @@ def test_groupby_apply_with_arrow_dtype(setup):
10731082

10741083
applied = mseries.groupby(mseries).apply(lambda s: s)
10751084
result = applied.execute().fetch()
1085+
result.index = result.index.astype(np.int64)
10761086
expected = series1.groupby(series1).apply(lambda s: s)
10771087
pd.testing.assert_series_equal(arrow_array_to_objects(result), expected)

mars/dataframe/indexing/getitem.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,7 @@ def series_getitem(series, labels, combine_size=None):
574574
if isinstance(labels, list) or np.isscalar(labels):
575575
op = SeriesIndex(labels=labels, combine_size=combine_size)
576576
return op(series, name=series.name)
577-
elif isinstance(labels, _list_like_types) and astensor(labels).dtype == np.bool:
577+
elif isinstance(labels, _list_like_types) and astensor(labels).dtype == np.bool_:
578578
return series.loc[labels]
579579
elif isinstance(labels, slice):
580580
edge = labels.start if labels.start is not None else labels.stop

0 commit comments

Comments (0)