mars-project
diff --git a/docs/source/conf.py b/docs/source/conf.py
Lines changed: 1 addition & 0 deletions
diff --git a/mars/dataframe/arrays.py b/mars/dataframe/arrays.py
Lines changed: 9 additions & 2 deletions
diff --git a/mars/dataframe/base/apply.py b/mars/dataframe/base/apply.py
Lines changed: 17 additions & 3 deletions
diff --git a/mars/dataframe/base/memory_usage.py b/mars/dataframe/base/memory_usage.py
Lines changed: 2 additions & 2 deletions
diff --git a/mars/dataframe/base/tests/test_base_execution.py b/mars/dataframe/base/tests/test_base_execution.py
Lines changed: 6 additions & 4 deletions
diff --git a/mars/dataframe/base/to_numeric.py b/mars/dataframe/base/to_numeric.py
Lines changed: 1 addition & 1 deletion
diff --git a/mars/dataframe/datasource/index.py b/mars/dataframe/datasource/index.py
Lines changed: 1 addition & 1 deletion
diff --git a/mars/dataframe/groupby/head.py b/mars/dataframe/groupby/head.py
Lines changed: 47 additions & 41 deletions
diff --git a/mars/dataframe/groupby/tests/test_groupby_execution.py b/mars/dataframe/groupby/tests/test_groupby_execution.py
Lines changed: 10 additions & 0 deletions
diff --git a/mars/dataframe/indexing/getitem.py b/mars/dataframe/indexing/getitem.py
Lines changed: 1 addition & 1 deletion
@@ -193,6 +193,7 @@
 
 locale_dirs = ['locale/']   # path is example but recommended.
 gettext_compact = False     # optional.
+ipython_warning_is_error = False
 
 
 import sphinx
 
@@ -56,7 +56,8 @@
 from ..core import is_kernel_mode
 from ..utils import pd_release_version, tokenize
 
-_use_bool_any_all = pd_release_version >= (1, 3, 0)
+_use_bool_any_all = pd_release_version[:2] >= (1, 3)
+_use_extension_index = pd_release_version[:2] >= (1, 4)
 
 
 class ArrowDtype(ExtensionDtype):
@@ -497,8 +498,14 @@ def astype(self, dtype, copy=True):
         # try to slice 1 record to get the result dtype
         test_array = self._arrow_array.slice(0, 1).to_pandas()
         test_result_array = test_array.astype(dtype).array
+        if _use_extension_index:
+            test_result_type = type(test_array.astype(dtype).values)
+            if test_result_type is np.ndarray:
+                test_result_type = np.array
+        else:
+            test_result_type = type(test_result_array)
 
-        result_array = type(test_result_array)(
+        result_array = test_result_type(
             np.full(
                 self.shape,
                 test_result_array.dtype.na_value,
 
@@ -30,6 +30,7 @@
     FunctionField,
 )
 from ...utils import enter_current_session, quiet_stdio
+from ..arrays import ArrowArray
 from ..operands import DataFrameOperandMixin, DataFrameOperand
 from ..utils import (
     build_df,
@@ -138,9 +139,22 @@ def execute(cls, ctx, op):
                 **op.kwds,
             )
         else:
-            result = input_data.apply(
-                op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds
-            )
+            try:
+                result = input_data.apply(
+                    op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds
+                )
+            except TypeError:
+                if isinstance(input_data.values, ArrowArray):
+                    input_data = pd.Series(
+                        input_data.to_numpy(),
+                        name=input_data.name,
+                        index=input_data.index,
+                    )
+                    result = input_data.apply(
+                        op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds
+                    )
+                else:  # pragma: no cover
+                    raise
         ctx[out.key] = result
 
     @classmethod
 
@@ -171,7 +171,7 @@ def _tile_dataframe(cls, op: "DataFrameMemoryUsage"):
 
         # produce map chunks
         # allocate matrix of chunks
-        chunks_to_reduce = np.empty(shape=df.chunk_shape, dtype=np.object)
+        chunks_to_reduce = np.empty(shape=df.chunk_shape, dtype=object)
         for c in df.chunks:
             new_op = op.copy().reset_key()
             new_op.stage = OperandStage.map
@@ -205,7 +205,7 @@ def _tile_dataframe(cls, op: "DataFrameMemoryUsage"):
                     ceildiv(chunks_to_reduce.shape[0], combine_size),
                     chunks_to_reduce.shape[1],
                 ),
-                dtype=np.object,
+                dtype=object,
             )
             for idx in range(0, chunks_to_reduce.shape[0], combine_size):
                 for idx2 in range(chunks_to_reduce.shape[1]):
 
@@ -565,21 +565,23 @@ def rename_fn(f, new_name):
 
 @pytest.mark.skipif(pa is None, reason="pyarrow not installed")
 def test_transform_with_arrow_dtype_execution(setup):
-    df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]})
-    df = from_pandas_df(df1)
+    raw = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]})
+    df = from_pandas_df(raw)
     df["b"] = df["b"].astype("Arrow[string]")
 
     r = df.transform({"b": lambda x: x + "_suffix"})
     result = r.execute().fetch()
-    expected = df1.transform({"b": lambda x: x + "_suffix"})
+    result["b"] = result["b"].to_numpy()
+    expected = raw.transform({"b": lambda x: x + "_suffix"})
     pd.testing.assert_frame_equal(result, expected)
 
-    s1 = df1["b"]
+    s1 = raw["b"]
     s = from_pandas_series(s1)
     s = s.astype("arrow_string")
 
     r = s.transform(lambda x: x + "_suffix")
     result = r.execute().fetch()
+    result = pd.Series(result.to_numpy(), name=result.name, index=result.index)
     expected = s1.transform(lambda x: x + "_suffix")
     pd.testing.assert_series_equal(result, expected)
 
 
@@ -55,7 +55,7 @@ def __call__(self, arg):
             self.output_types = [OutputType.tensor]
             dtype = tensor.dtype
             if dtype.kind == "U":
-                dtype = np.dtype(np.object_)
+                dtype = np.dtype(object)
             return self.new_tileables([tensor], shape=tensor.shape, dtype=dtype)[0]
 
     @classmethod
 
@@ -240,7 +240,7 @@ def execute(cls, ctx, op):
         else:
             out = op.outputs[0]
             inp = ctx[op.inputs[0].key]
-            dtype = out.dtype if out.dtype != np.object else None
+            dtype = out.dtype if out.dtype != object else None
             if hasattr(inp, "index"):
                 # DataFrame, Series
                 ctx[out.key] = pd.Index(inp.index, dtype=dtype, name=out.name)
 
@@ -17,29 +17,22 @@
 
 from ... import opcodes
 from ...core import OutputType, get_output_types, recursive_tile
-from ...serialization.serializables import DictField, Int64Field
+from ...serialization.serializables import DictField, Int64Field, BoolField
+from ...utils import pd_release_version
 from ..core import IndexValue
 from ..operands import DataFrameOperandMixin, DataFrameOperand
 from ..utils import build_concatenated_rows_frame, parse_index
 
+_pandas_enable_negative = pd_release_version >= (1, 4, 0)
+
 
 class GroupByHead(DataFrameOperand, DataFrameOperandMixin):
     _op_type_ = opcodes.GROUPBY_HEAD
     _op_module_ = "dataframe.groupby"
 
-    _row_count = Int64Field("row_count")
-    _groupby_params = DictField("groupby_params")
-
-    def __init__(self, row_count=None, groupby_params=None, **kw):
-        super().__init__(_row_count=row_count, _groupby_params=groupby_params, **kw)
-
-    @property
-    def row_count(self) -> int:
-        return self._row_count
-
-    @property
-    def groupby_params(self) -> dict:
-        return self._groupby_params
+    row_count = Int64Field("row_count")
+    groupby_params = DictField("groupby_params")
+    enable_negative = BoolField("enable_negative")
 
     def __call__(self, groupby):
         df = groupby
@@ -72,30 +65,32 @@ def tile(cls, op: "GroupByHead"):
         groupby_params = op.groupby_params.copy()
         selection = groupby_params.pop("selection", None)
 
+        enable_negative = _pandas_enable_negative and op.enable_negative
+
         if len(in_df.shape) > 1:
             in_df = build_concatenated_rows_frame(in_df)
         out_df = op.outputs[0]
 
-        # when row_count is not positive or there is only one chunk,
-        # tile with a single chunk
-        if op.row_count <= 0 or len(in_df.chunks) == 0:
+        # when row_count is not positive and pandas does not support negative head,
+        # or there is at most one chunk, tile with a single chunk
+        if (not enable_negative and op.row_count <= 0) or len(in_df.chunks) <= 1:
+            row_num = 0 if not enable_negative and op.row_count <= 0 else np.nan
+            new_shape = (row_num,)
+            new_nsplits = ((row_num,),)
+            if out_df.ndim > 1:
+                new_shape += (out_df.shape[1],)
+                new_nsplits += ((out_df.shape[1],),)
+
             c = in_df.chunks[0]
             chunk_op = op.copy().reset_key()
-            params = c.params
-            row_num = 0 if op.row_count <= 0 else np.nan
-            params["shape"] = (row_num,) + c.shape[1:]
-            params["index_value"] = out_df.index_value
+            params = out_df.params
+            params["shape"] = new_shape
+            params["index"] = (0,) * out_df.ndim
             out_chunk = chunk_op.new_chunk([c], **params)
 
             tileable_op = op.copy().reset_key()
-            params = out_df.params
-            params["shape"] = (row_num,) + c.shape[1:]
-            params["index_value"] = out_df.index_value
             return tileable_op.new_tileables(
-                [in_df],
-                nsplits=((row_num,),) + in_df.nsplits[1:],
-                chunks=[out_chunk],
-                **params
+                [in_df], nsplits=new_nsplits, chunks=[out_chunk], **params
             )
 
         if in_df.ndim > 1 and selection:
@@ -116,15 +111,19 @@ def tile(cls, op: "GroupByHead"):
                 in_df = yield from recursive_tile(in_df[pre_selection])
 
         # generate pre chunks
-        pre_chunks = []
-        for c in in_df.chunks:
-            pre_op = op.copy().reset_key()
-            pre_op._output_types = get_output_types(c)
-            pre_op._groupby_params = op.groupby_params.copy()
-            pre_op._groupby_params.pop("selection", None)
-            params = c.params
-            params["shape"] = (np.nan,) + c.shape[1:]
-            pre_chunks.append(pre_op.new_chunk([c], **params))
+        if op.row_count < 0:
+            # when we have negative row counts, pre-groupby optimization is not possible
+            pre_chunks = in_df.chunks
+        else:
+            pre_chunks = []
+            for c in in_df.chunks:
+                pre_op = op.copy().reset_key()
+                pre_op._output_types = get_output_types(c)
+                pre_op.groupby_params = op.groupby_params.copy()
+                pre_op.groupby_params.pop("selection", None)
+                params = c.params
+                params["shape"] = (np.nan,) + c.shape[1:]
+                pre_chunks.append(pre_op.new_chunk([c], **params))
 
         new_op = op.copy().reset_key()
         new_op._output_types = get_output_types(in_df)
@@ -142,8 +141,8 @@ def tile(cls, op: "GroupByHead"):
         post_chunks = []
         for c in grouped.chunks:
             post_op = op.copy().reset_key()
-            post_op._groupby_params = op.groupby_params.copy()
-            post_op._groupby_params.pop("selection", None)
+            post_op.groupby_params = op.groupby_params.copy()
+            post_op.groupby_params.pop("selection", None)
             if op.output_types[0] == OutputType.dataframe:
                 index = c.index
             else:
@@ -175,7 +174,10 @@ def execute(cls, ctx, op: "GroupByHead"):
         if selection:
             grouped = grouped[selection]
 
-        ctx[op.outputs[0].key] = grouped.head(op.row_count)
+        result = grouped.head(op.row_count)
+        if not op.enable_negative and op.row_count < 0:
+            result = result.iloc[:0]
+        ctx[op.outputs[0].key] = result
 
 
 def head(groupby, n=5):
@@ -215,5 +217,9 @@ def head(groupby, n=5):
     groupby_params = groupby.op.groupby_params.copy()
     groupby_params.pop("as_index", None)
 
-    op = GroupByHead(row_count=n, groupby_params=groupby_params)
+    op = GroupByHead(
+        row_count=n,
+        groupby_params=groupby_params,
+        enable_negative=_pandas_enable_negative,
+    )
     return op(groupby)
@@ -869,6 +869,11 @@ def test_groupby_head(setup):
         r.execute().fetch().sort_index(), df1.groupby("b").head(1)
     )
 
+    r = mdf.groupby("b").head(-1)
+    pd.testing.assert_frame_equal(
+        r.execute().fetch().sort_index(), df1.groupby("b").head(-1)
+    )
+
     # test head with selection
     r = mdf.groupby("b")["a", "d"].head(1)
     pd.testing.assert_frame_equal(
@@ -1036,6 +1041,7 @@ def test_groupby_agg_with_arrow_dtype(setup):
 
     r = mdf.groupby("b").count()
     result = r.execute().fetch()
+    result.index = result.index.astype(object)
     expected = df1.groupby("b").count()
     pd.testing.assert_frame_equal(result, expected)
 
@@ -1044,6 +1050,7 @@ def test_groupby_agg_with_arrow_dtype(setup):
 
     r = mseries.groupby(mseries).count()
     result = r.execute().fetch()
+    result.index = result.index.astype(object)
     expected = series1.groupby(series1).count()
     pd.testing.assert_series_equal(result, expected)
 
@@ -1053,6 +1060,7 @@ def test_groupby_agg_with_arrow_dtype(setup):
 
     r = mseries.groupby(mseries).count()
     result = r.execute().fetch()
+    result.index = result.index.astype(object)
     expected = series2.groupby(series2).count()
     pd.testing.assert_series_equal(result, expected)
 
@@ -1065,6 +1073,7 @@ def test_groupby_apply_with_arrow_dtype(setup):
 
     applied = mdf.groupby("b").apply(lambda df: df.a.sum())
     result = applied.execute().fetch()
+    result.index = result.index.astype(object)
     expected = df1.groupby("b").apply(lambda df: df.a.sum())
     pd.testing.assert_series_equal(result, expected)
 
@@ -1073,5 +1082,6 @@ def test_groupby_apply_with_arrow_dtype(setup):
 
     applied = mseries.groupby(mseries).apply(lambda s: s)
     result = applied.execute().fetch()
+    result.index = result.index.astype(np.int64)
     expected = series1.groupby(series1).apply(lambda s: s)
     pd.testing.assert_series_equal(arrow_array_to_objects(result), expected)
@@ -574,7 +574,7 @@ def series_getitem(series, labels, combine_size=None):
     if isinstance(labels, list) or np.isscalar(labels):
         op = SeriesIndex(labels=labels, combine_size=combine_size)
         return op(series, name=series.name)
-    elif isinstance(labels, _list_like_types) and astensor(labels).dtype == np.bool:
+    elif isinstance(labels, _list_like_types) and astensor(labels).dtype == np.bool_:
         return series.loc[labels]
     elif isinstance(labels, slice):
         edge = labels.start if labels.start is not None else labels.stop