|
17 | 17 | from collections import defaultdict |
18 | 18 | from typing import Dict, Iterable, List, Tuple |
19 | 19 |
|
20 | | -from ....utils import ceildiv, lazy_import |
| 20 | +from ....utils import lazy_import |
21 | 21 |
|
22 | 22 | ray = lazy_import("ray") |
23 | 23 | parallel_it = lazy_import("ray.util.iter") |
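
Note: the `if ray:` guard added further down relies on `lazy_import` resolving to a falsy value when Ray is not installed. A minimal sketch of that pattern, assuming the real helper in `....utils` behaves roughly like this (it may differ):

```python
# Hedged sketch of a lazy-import helper; assumes ....utils.lazy_import returns
# None (or another falsy placeholder) when the requested module is unavailable.
import importlib
import importlib.util


def lazy_import_sketch(name):
    # Guard optional dependencies: call sites can simply test `if module:`.
    if importlib.util.find_spec(name) is None:
        return None
    return importlib.import_module(name)


ray = lazy_import_sketch("ray")  # None when Ray is absent
```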
@@ -71,43 +71,37 @@ def _group_chunk_refs( |
71 | 71 |     return group_to_obj_refs |
72 | 72 |
|
73 | 73 |
|
74 | | -def _create_ml_dataset(name: str, group_to_obj_refs: Dict[str, List["ray.ObjectRef"]]): |
75 | | -    record_batches = [] |
76 | | -    for rank, obj_refs in enumerate(group_to_obj_refs.values()): |
77 | | -        record_batches.append(ChunkRefBatch(shard_id=rank, obj_refs=obj_refs)) |
78 | | -    worker_cls = ray.remote(num_cpus=0)(parallel_it.ParallelIteratorWorker) |
79 | | -    actors = [worker_cls.remote(g, False) for g in record_batches] |
80 | | -    it = parallel_it.from_actors(actors, name) |
81 | | -    ds = ml_dataset.from_parallel_iter( |
82 | | -        it, need_convert=False, batch_size=0, repeated=False |
83 | | -    ) |
84 | | -    return ds |
| 74 | +def _rechunk_if_needed(df, num_shards: int = None): |
| 75 | +    try: |
| 76 | +        if num_shards: |
| 77 | +            assert isinstance(num_shards, int) and num_shards > 0 |
| 78 | +            df = df.rebalance(axis=0, num_partitions=num_shards) |
| 79 | +        df = df.rechunk({1: df.shape[1]}) |
| 80 | +        df = df.reset_index(drop=True) |
| 81 | +        return df.execute() |
| 82 | +    except Exception as e:  # pragma: no cover |
| 83 | +        raise Exception(f"rechunk failed df.shape {df.shape}") from e |
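
For context, a hedged sketch of what the simplified `_rechunk_if_needed` does to a Mars dataframe: rows are rebalanced across `num_shards` partitions and the column axis is collapsed into a single chunk, so each shard later receives complete rows. The dataframe contents and chunk sizes below are illustrative, and an active Mars session is assumed:

```python
# Hedged usage sketch; mirrors the calls made in _rechunk_if_needed above.
import pandas as pd
import mars.dataframe as md

df = md.DataFrame(pd.DataFrame({"a": range(8), "b": range(8)}), chunk_size=2)
df = df.rebalance(axis=0, num_partitions=4)   # one row-partition per shard
df = df.rechunk({1: df.shape[1]})             # each chunk spans all columns
df = df.reset_index(drop=True)                # rebuild a clean RangeIndex
df.execute()                                  # materialize the new chunking
```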
85 | 84 |
|
86 | 85 |
|
87 | | -def _rechunk_if_needed(df, num_shards: int = None): |
88 | | -    chunk_size = df.extra_params.raw_chunk_size or max(df.shape) |
89 | | -    num_rows = df.shape[0] |
90 | | -    num_columns = df.shape[1] |
91 | | -    # if chunk size not set, num_chunks_in_row = 1 |
92 | | -    # if chunk size is set more than max(df.shape), num_chunks_in_row = 1 |
93 | | -    # otherwise, num_chunks_in_row depends on ceildiv(num_rows, chunk_size) |
94 | | -    num_chunks_in_row = ceildiv(num_rows, chunk_size) |
95 | | -    naive_num_partitions = ceildiv(num_rows, num_columns) |
96 | | - |
97 | | -    need_re_execute = False |
98 | | -    # ensure each part holds all columns |
99 | | -    if chunk_size < num_columns: |
100 | | -        df = df.rebalance(axis=1, num_partitions=1) |
101 | | -        need_re_execute = True |
102 | | -    if num_shards and num_chunks_in_row < num_shards: |
103 | | -        df = df.rebalance(axis=0, num_partitions=num_shards) |
104 | | -        need_re_execute = True |
105 | | -    if not num_shards and num_chunks_in_row == 1: |
106 | | -        df = df.rebalance(axis=0, num_partitions=naive_num_partitions) |
107 | | -        need_re_execute = True |
108 | | -    if need_re_execute: |
109 | | -        df.execute() |
110 | | -    return df |
| 86 | +if ray: |
| 87 | + |
| 88 | +    class _MLDataset(ml_dataset.MLDataset): |
| 89 | +        def __init__(self, mars_dataframe, actor_sets, name: str, parent_iterators): |
| 90 | +            super().__init__(actor_sets, name, parent_iterators, 0, False) |
| 91 | +            # Hold the mars dataframe so it and its ray objects are not garbage-collected. |
| 92 | +            # TODO(mubai) Use a separate operator for rechunking and avoiding gc. |
| 93 | +            self._mars_dataframe = mars_dataframe |
| 94 | + |
| 95 | +        def __getstate__(self): |
| 96 | +            state = self.__dict__.copy() |
| 97 | +            state.pop("_mars_dataframe", None) |
| 98 | +            return state |
| 99 | + |
| 100 | +        # The default __setstate__ will update _MLDataset's __dict__; no override is needed. |
| 101 | + |
| 102 | + |
| 103 | +else: |
| 104 | +    _MLDataset = None |
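
The `__getstate__` override above exists so the heavyweight Mars dataframe is pinned on the driver (preventing the underlying Ray objects from being garbage-collected) but dropped whenever the dataset is pickled, e.g. when shipped to a worker. A small, generic illustration of that pattern, using a hypothetical `Holder` class that is not part of this module:

```python
# Generic sketch of the "pin locally, drop on pickle" pattern used by _MLDataset.
import pickle


class Holder:
    def __init__(self, payload, heavy_ref):
        self.payload = payload
        self._heavy_ref = heavy_ref  # kept alive locally, never serialized

    def __getstate__(self):
        state = self.__dict__.copy()
        state.pop("_heavy_ref", None)  # same trick as _MLDataset.__getstate__
        return state


h = Holder(payload=[1, 2, 3], heavy_ref=object())
restored = pickle.loads(pickle.dumps(h))
assert not hasattr(restored, "_heavy_ref") and restored.payload == [1, 2, 3]
```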
111 | 105 |
|
112 | 106 |
|
113 | 107 | def to_ray_mldataset(df, num_shards: int = None): |
@@ -139,4 +133,11 @@ def to_ray_mldataset(df, num_shards: int = None): |
139 | 133 |     group_to_obj_refs: Dict[str, List[ray.ObjectRef]] = _group_chunk_refs( |
140 | 134 |         chunk_addr_refs, num_shards |
141 | 135 |     ) |
142 | | -    return _create_ml_dataset("from_mars", group_to_obj_refs) |
| 136 | + |
| 137 | +    record_batches = [] |
| 138 | +    for rank, obj_refs in enumerate(group_to_obj_refs.values()): |
| 139 | +        record_batches.append(ChunkRefBatch(shard_id=rank, obj_refs=obj_refs)) |
| 140 | +    worker_cls = ray.remote(num_cpus=0)(parallel_it.ParallelIteratorWorker) |
| 141 | +    actors = [worker_cls.remote(g, False) for g in record_batches] |
| 142 | +    it = parallel_it.from_actors(actors, "from_mars") |
| 143 | +    return _MLDataset(df, it.actor_sets, it.name, it.parent_iterators) |
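
A hedged end-to-end sketch of how the rewritten `to_ray_mldataset` is meant to be used; it assumes Mars is running on a Ray cluster and that `to_ray_mldataset` is importable from this module, so the exact import path and session setup may differ in the repository:

```python
# Hedged usage sketch; to_ray_mldataset is the function changed in this diff.
import pandas as pd
import mars.dataframe as md

df = md.DataFrame(pd.DataFrame({"a": range(100), "b": range(100)}))
mlds = to_ray_mldataset(df, num_shards=4)
# mlds is an _MLDataset (a ray.util.data.MLDataset); it keeps `df` referenced
# on the driver, and each of the 4 shards iterates pandas batches rebuilt from
# the Mars chunk object refs.
assert mlds.num_shards() == 4
```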