Commit 3949506

[Operand] support loc setitem (#3291)
* support loc setitem
* check column splits
* fix iloc indexes
* refine test_ownership_when_scale_in ut
* fix process_loc_indexes
* lint code
* fix loc to iloc
* add loc row index test
1 parent 9d9cc6e commit 3949506
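
From the user side, the commit enables scalar assignment through DataFrame.loc on a chunked Mars DataFrame. A minimal sketch of the newly supported patterns, mirroring the test added in this commit (the chunk size and the explicit mars.new_session() call are illustrative, not part of the change):

    import mars
    import mars.dataframe as md
    import pandas as pd

    mars.new_session()  # assumed local session; the test suite uses its own setup fixture

    raw = pd.DataFrame({"a": [1, 2, 3, 4, 2, 4, 5, 7, 2, 8, 9], 1: [10] * 11})

    df = md.DataFrame(raw, chunk_size=3)
    df.loc[df["a"] <= 4, 1] = "v1"      # boolean-mask row indexer (a tileable), scalar value
    expected = raw.copy(True)
    expected.loc[expected["a"] <= 4, 1] = "v1"
    pd.testing.assert_frame_equal(df.to_pandas(), expected)

    df2 = md.DataFrame(raw, chunk_size=3)
    df2.loc[1:3, 1] = "v2"              # label slice plus column label; may be delegated to the iloc path
    expected2 = raw.copy(True)
    expected2.loc[1:3, 1] = "v2"
    pd.testing.assert_frame_equal(df2.to_pandas(), expected2)

Only scalar values are accepted on the right-hand side; non-scalar values and non-DataFrame targets still raise NotImplementedError, as the new __setitem__ below shows.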

File tree

4 files changed: +151 −10 lines

  mars/dataframe/indexing/loc.py
  mars/dataframe/indexing/tests/test_indexing_execution.py
  mars/deploy/oscar/tests/test_ray_scheduling.py
  mars/services/scheduling/supervisor/autoscale.py

mars/dataframe/indexing/loc.py

Lines changed: 125 additions & 5 deletions
@@ -20,23 +20,24 @@
 from pandas.core.dtypes.cast import find_common_type
 from pandas.core.indexing import IndexingError
 
+from .iloc import DataFrameIlocSetItem
 from ... import opcodes as OperandDef
-from ...core import ENTITY_TYPE
+from ...core import ENTITY_TYPE, OutputType
 from ...core.operand import OperandStage
-from ...serialization.serializables import KeyField, ListField
+from ...serialization.serializables import KeyField, ListField, AnyField
 from ...tensor.datasource import asarray
 from ...tensor.utils import calc_sliced_size, filter_inputs
 from ...utils import lazy_import, is_full_slice
 from ..core import IndexValue, DATAFRAME_TYPE
 from ..operands import DataFrameOperand, DataFrameOperandMixin
-from ..utils import parse_index
+from ..utils import parse_index, is_index_value_identical
 from .index_lib import DataFrameLocIndexesHandler
 
 
 cudf = lazy_import("cudf")
 
 
-def process_loc_indexes(inp, indexes):
+def process_loc_indexes(inp, indexes, fetch_index: bool = True):
     ndim = inp.ndim
 
     if not isinstance(indexes, tuple):
@@ -51,7 +52,7 @@ def process_loc_indexes(inp, indexes):
         if isinstance(index, (list, np.ndarray, pd.Series, ENTITY_TYPE)):
             if not isinstance(index, ENTITY_TYPE):
                 index = np.asarray(index)
-            else:
+            elif fetch_index:
                 index = asarray(index)
                 if ax == 1:
                     # do not support tensor index on axis 1
@@ -116,6 +117,125 @@ def __getitem__(self, indexes):
         op = DataFrameLocGetItem(indexes=indexes)
         return op(self._obj)
 
+    def __setitem__(self, indexes, value):
+        if not np.isscalar(value):
+            raise NotImplementedError("Only scalar value is supported to set by loc")
+        if not isinstance(self._obj, DATAFRAME_TYPE):
+            raise NotImplementedError("Only DataFrame is supported to set by loc")
+        indexes = process_loc_indexes(self._obj, indexes, fetch_index=False)
+        use_iloc, new_indexes = self._use_iloc(indexes)
+        if use_iloc:
+            op = DataFrameIlocSetItem(indexes=new_indexes, value=value)
+            ret = op(self._obj)
+            self._obj.data = ret.data
+        else:
+            other_indices = []
+            indices_tileable = [
+                idx
+                for idx in indexes
+                if isinstance(idx, ENTITY_TYPE) or other_indices.append(idx)
+            ]
+            op = DataFramelocSetItem(indexes=other_indices, value=value)
+            ret = op([self._obj] + indices_tileable)
+            self._obj.data = ret.data
+
+
+class DataFramelocSetItem(DataFrameOperand, DataFrameOperandMixin):
+    _op_type_ = OperandDef.DATAFRAME_ILOC_SETITEM
+
+    _indexes = ListField("indexes")
+    _value = AnyField("value")
+
+    def __init__(
+        self, indexes=None, value=None, gpu=None, sparse=False, output_types=None, **kw
+    ):
+        super().__init__(
+            _indexes=indexes,
+            _value=value,
+            gpu=gpu,
+            sparse=sparse,
+            _output_types=output_types,
+            **kw,
+        )
+        if not self.output_types:
+            self.output_types = [OutputType.dataframe]
+
+    @property
+    def indexes(self):
+        return self._indexes
+
+    @property
+    def value(self):
+        return self._value
+
+    def __call__(self, inputs):
+        df = inputs[0]
+        return self.new_dataframe(
+            inputs,
+            shape=df.shape,
+            dtypes=df.dtypes,
+            index_value=df.index_value,
+            columns_value=df.columns_value,
+        )
+
+    @classmethod
+    def tile(cls, op):
+        in_df = op.inputs[0]
+        out_df = op.outputs[0]
+        out_chunks = []
+        if len(op.inputs) > 1:
+            index_series = op.inputs[1]
+            is_identical = is_index_value_identical(in_df, index_series)
+            if not is_identical:
+                raise NotImplementedError("Only identical index value is supported")
+            if len(in_df.nsplits[1]) != 1:
+                raise NotImplementedError("Column-split chunks are not supported")
+            for target_chunk, index_chunk in zip(in_df.chunks, index_series.chunks):
+                chunk_op = op.copy().reset_key()
+                out_chunk = chunk_op.new_chunk(
+                    [target_chunk, index_chunk],
+                    shape=target_chunk.shape,
+                    index=target_chunk.index,
+                    dtypes=target_chunk.dtypes,
+                    index_value=target_chunk.index_value,
+                    columns_value=target_chunk.columns_value,
+                )
+                out_chunks.append(out_chunk)
+        else:
+            for target_chunk in in_df.chunks:
+                chunk_op = op.copy().reset_key()
+                out_chunk = chunk_op.new_chunk(
+                    [target_chunk],
+                    shape=target_chunk.shape,
+                    index=target_chunk.index,
+                    dtypes=target_chunk.dtypes,
+                    index_value=target_chunk.index_value,
+                    columns_value=target_chunk.columns_value,
+                )
+                out_chunks.append(out_chunk)
+
+        new_op = op.copy()
+        return new_op.new_dataframes(
+            op.inputs,
+            shape=out_df.shape,
+            dtypes=out_df.dtypes,
+            index_value=out_df.index_value,
+            columns_value=out_df.columns_value,
+            chunks=out_chunks,
+            nsplits=in_df.nsplits,
+        )
+
+    @classmethod
+    def execute(cls, ctx, op):
+        chunk = op.outputs[0]
+        r = ctx[op.inputs[0].key].copy(deep=True)
+        if len(op.inputs) > 1:
+            row_index = ctx[op.inputs[1].key]
+            r.loc[(row_index,) + tuple(op.indexes)] = op.value
+        else:
+            r.loc[tuple(op.indexes)] = op.value
+        ctx[chunk.key] = r
+
 
 class DataFrameLocGetItem(DataFrameOperand, DataFrameOperandMixin):
     _op_type_ = OperandDef.DATAFRAME_LOC_GETITEM
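
At execution time each chunk is handled with plain pandas: the worker deep-copies its chunk, combines the aligned row-index chunk (when a tileable row indexer was supplied) with the remaining label indexers kept on the operand, and performs an ordinary .loc assignment. A pandas-only sketch of that per-chunk step (variable names and values are illustrative):

    import pandas as pd

    chunk_data = pd.DataFrame({"a": [1, 5, 2], 1: [10, 10, 10]})
    row_mask = pd.Series([True, False, True])   # aligned boolean index chunk
    remaining_indexes = (1,)                     # label indexers stored on the op
    value = -1                                   # scalar right-hand side

    result = chunk_data.copy(deep=True)          # the input chunk is never mutated in place
    result.loc[(row_mask,) + tuple(remaining_indexes)] = value

Because every chunk is copied and assigned in place of itself, the output chunk keeps the input chunk's shape and index metadata, which is why tile() can reuse the input's nsplits directly.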

mars/dataframe/indexing/tests/test_indexing_execution.py

Lines changed: 21 additions & 0 deletions
@@ -1636,6 +1636,27 @@ def test_sample_execution(setup):
     pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch())
 
 
+def test_loc_setitem(setup):
+    raw_df = pd.DataFrame({"a": [1, 2, 3, 4, 2, 4, 5, 7, 2, 8, 9], 1: [10] * 11})
+    md_data = md.DataFrame(raw_df, chunk_size=3)
+    md_data.loc[md_data["a"] <= 4, 1] = "v1"
+    pd_data = raw_df.copy(True)
+    pd_data.loc[pd_data["a"] <= 4, 1] = "v1"
+    pd.testing.assert_frame_equal(md_data.to_pandas(), pd_data)
+
+    md_data1 = md.DataFrame(raw_df, chunk_size=3)
+    md_data1.loc[1:3] = "v2"
+    pd_data1 = raw_df.copy(True)
+    pd_data1.loc[1:3] = "v2"
+    pd.testing.assert_frame_equal(md_data1.to_pandas(), pd_data1)
+
+    md_data2 = md.DataFrame(raw_df, chunk_size=3)
+    md_data2.loc[1:3, 1] = "v2"
+    pd_data2 = raw_df.copy(True)
+    pd_data2.loc[1:3, 1] = "v2"
+    pd.testing.assert_frame_equal(md_data2.to_pandas(), pd_data2)
+
+
 def test_add_prefix_suffix(setup):
     rs = np.random.RandomState(0)
     raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"])

mars/deploy/oscar/tests/test_ray_scheduling.py

Lines changed: 4 additions & 4 deletions
@@ -247,9 +247,9 @@ async def test_ownership_when_scale_in(ray_large_cluster):
         supervisor_mem=200 * 1024**2,
         config={
             "scheduling.autoscale.enabled": True,
-            "scheduling.autoscale.scheduler_check_interval": 1,
-            "scheduling.autoscale.scheduler_backlog_timeout": 1,
-            "scheduling.autoscale.worker_idle_timeout": 10,
+            "scheduling.autoscale.scheduler_check_interval": 0.1,
+            "scheduling.autoscale.scheduler_backlog_timeout": 0.5,
+            "scheduling.autoscale.worker_idle_timeout": 1,
             "scheduling.autoscale.min_workers": 1,
             "scheduling.autoscale.max_workers": 4,
         },
@@ -259,7 +259,7 @@ async def test_ownership_when_scale_in(ray_large_cluster):
         uid=AutoscalerActor.default_uid(),
         address=client._cluster.supervisor_address,
     )
-    num_chunks, chunk_size = 20, 4
+    num_chunks, chunk_size = 10, 4
     df = md.DataFrame(
         mt.random.rand(num_chunks * chunk_size, 4, chunk_size=chunk_size),
         columns=list("abcd"),

mars/services/scheduling/supervisor/autoscale.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ async def request_worker(
         )
         if worker_address:
             self._dynamic_workers.add(worker_address)
-            logger.info(
+            logger.warning(
                 "Requested new worker %s in %.4f seconds, current dynamic worker nums is %s",
                 worker_address,
                 time.time() - start_time,
