Commit 7ad7e03

[Ray] Refine raydataset integration (#2579)
1 parent 35ffc16 commit 7ad7e03

12 files changed: +403 -90 lines changed


.github/workflows/platform-ci.yml

Lines changed: 2 additions & 4 deletions
@@ -83,10 +83,8 @@ jobs:
           rm -fr /tmp/etcd-$ETCD_VER-linux-amd64.tar.gz /tmp/etcd-download-test
         fi
         if [ -n "$WITH_RAY" ]; then
-          # Change back to `pip install ray[default]` when ray-1.7.0 is released.
-          pip install https://s3-us-west-2.amazonaws.com/ray-wheels/master/aee7ba2510dd0eeed8f84dba3e9c5d58cb97d49a/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
-          # Change back to `pip install xgboost_ray` when it doesn't install older version of numpy
-          pip install git+https://github.com/ray-project/xgboost_ray.git#xgboost_ray
+          pip install ray[default]
+          pip install xgboost_ray
           pip install --upgrade numpy
         fi
         if [ -n "$RUN_DASK" ]; then

README.rst

Lines changed: 11 additions & 9 deletions
@@ -235,16 +235,9 @@ by default session once it is created.
     dtype: int64
 
 
-Easy to scale in and scale out
-------------------------------
-
-Mars can scale in to a single machine, and scale out to a cluster with thousands of machines.
-It's fairly simple to migrate from a single machine to a cluster to
-process more data or gain a better performance.
-
 Mars on Ray
 ------------
-Mars also has deep integration with Ray and can run on `Ray <https://docs.ray.io/en/latest/>` efficiently and
+Mars also has deep integration with Ray and can run on `Ray <https://docs.ray.io/en/latest/>`_ efficiently and
 interact with the large ecosystem of machine learning and distributed systems built on top of the core Ray.
 
 Starting a new Mars on Ray runtime locally via:
@@ -258,7 +251,7 @@ Starting a new Mars on Ray runtime locally via:
     import mars.tensor as mt
     mt.random.RandomState(0).rand(1000_0000, 5).sum().execute()
 
-Or connecting to a Mars on Ray cluster which is already initialized.
+Or connect to a Mars on Ray runtime which is already initialized:
 
 .. code-block:: python
 
@@ -285,6 +278,15 @@ Interact with Ray Dataset:
 
 Refer to `Mars on Ray`_ for more information.
 
+
+Easy to scale in and scale out
+------------------------------
+
+Mars can scale in to a single machine, and scale out to a cluster with thousands of machines.
+It's fairly simple to migrate from a single machine to a cluster to
+process more data or gain better performance.
+
+
 Bare Metal Deployment
 `````````````````````
 
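Note: to make the README's "Interact with Ray Dataset" flow concrete, here is a minimal sketch of the round trip, assuming a local Ray installation. It relies on the `mars.new_ray_session` entry point from the Mars on Ray docs and the `to_ray_dataset` helper this commit exports; the shapes and shard count are illustrative.

    import mars
    import mars.tensor as mt
    import mars.dataframe as md

    # Start a Mars on Ray runtime locally, as in the README snippet above.
    mars.new_ray_session(worker_num=2, worker_mem=2 * 1024 ** 3)

    # Build and execute a Mars DataFrame, then hand its chunks to Ray Dataset.
    df = md.DataFrame(mt.random.rand(1000, 4), columns=list("abcd"))
    df.execute()
    ds = md.to_ray_dataset(df, num_shards=2)
    print(ds.count())  # 1000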

mars/conftest.py

Lines changed: 27 additions & 2 deletions
@@ -50,15 +50,17 @@ def ray_start_regular(request):
 def ray_large_cluster(request):  # pragma: no cover
     param = getattr(request, "param", {})
     num_nodes = param.get("num_nodes", 3)
-    num_cpus = param.get("num_cpus", 10)
+    num_cpus = param.get("num_cpus", 16)
     try:
         from ray.cluster_utils import Cluster
     except ModuleNotFoundError:
         from ray._private.cluster_utils import Cluster
     cluster = Cluster()
     remote_nodes = []
     for i in range(num_nodes):
-        remote_nodes.append(cluster.add_node(num_cpus=num_cpus))
+        remote_nodes.append(
+            cluster.add_node(num_cpus=num_cpus, memory=num_cpus * 2 * 1024 ** 3)
+        )
         if len(remote_nodes) == 1:
             ray.init(address=cluster.address)
     register_ray_serializers()
@@ -82,6 +84,29 @@ def stop_ray(request):  # pragma: no cover
     ray.shutdown()
 
 
+@pytest.fixture
+async def ray_create_mars_cluster(request):
+    from mars.deploy.oscar.ray import new_cluster, _load_config
+
+    ray_config = _load_config()
+    param = getattr(request, "param", {})
+    supervisor_mem = param.get("supervisor_mem", 1 * 1024 ** 3)
+    worker_num = param.get("worker_num", 2)
+    worker_cpu = param.get("worker_cpu", 2)
+    worker_mem = param.get("worker_mem", 256 * 1024 ** 2)
+    ray_config.update(param.get("config", {}))
+    client = await new_cluster(
+        "test_cluster",
+        supervisor_mem=supervisor_mem,
+        worker_num=worker_num,
+        worker_cpu=worker_cpu,
+        worker_mem=worker_mem,
+        config=ray_config,
+    )
+    async with client:
+        yield client
+
+
 @pytest.fixture(scope="module")
 def _stop_isolation():
     yield
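Note: a hypothetical test consuming the new `ray_create_mars_cluster` fixture could look like the following; the test name and parametrized values are illustrative, not part of this commit.

    import pytest


    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "ray_create_mars_cluster",
        [{"worker_num": 3, "worker_mem": 512 * 1024 ** 2}],
        indirect=True,  # routes the dict into request.param inside the fixture
    )
    async def test_on_mars_cluster(ray_large_cluster, ray_create_mars_cluster):
        client = ray_create_mars_cluster
        assert client.session is not None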

mars/dataframe/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -23,14 +23,15 @@
 from .base.melt import melt
 from .base.qcut import qcut
 from .base.to_numeric import to_numeric
+from .contrib.raydataset import to_ray_mldataset, to_ray_dataset
 from .datasource.from_tensor import dataframe_from_tensor, series_from_tensor
 from .datasource.from_index import series_from_index
 from .datasource.from_records import from_records
 from .datasource.from_vineyard import from_vineyard
 from .datasource.read_csv import read_csv
 from .datasource.read_sql import read_sql, read_sql_table, read_sql_query
 from .datasource.read_parquet import read_parquet
-from .datasource.read_raydataset import read_raydataset
+from .datasource.read_raydataset import read_raydataset, read_ray_mldataset
 from .datasource.date_range import date_range
 from .fetch import DataFrameFetch, DataFrameFetchShuffle
 from .merge import concat, merge
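Note: with these exports the Ray conversions become reachable directly from `mars.dataframe`. A round-trip sketch follows; only the dataframe/dataset argument and `num_shards` are visible in this commit, so anything beyond them would be an assumption.

    import pandas as pd
    import mars.dataframe as md

    df = md.DataFrame(pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})).execute()

    ds = md.to_ray_dataset(df)                    # Mars DataFrame -> Ray Dataset
    df_back = md.read_raydataset(ds)              # Ray Dataset -> Mars DataFrame

    mlds = md.to_ray_mldataset(df, num_shards=2)  # Mars DataFrame -> Ray MLDataset
    df_back2 = md.read_ray_mldataset(mlds)        # Ray MLDataset -> Mars DataFrame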

mars/dataframe/contrib/raydataset/dataset.py

Lines changed: 23 additions & 5 deletions
@@ -20,7 +20,28 @@
 # Ray Datasets is available in early preview at ray.data with Ray 1.6+
 # (and ray.experimental.data in Ray 1.5)
 ray_dataset = lazy_import("ray.data")
-ray_exp_dataset = lazy_import("ray.experimental.data")
+
+
+if ray:
+    import ray.data.dataset
+
+    class _Dataset(ray_dataset.Dataset):
+        def __init__(self, mars_dataframe, blocks):
+            super().__init__(blocks, 0)
+            # Hold the mars dataframe to keep it and its ray objects from being gc-ed.
+            # TODO(mubai) Use a separate operator for rechunk and avoiding gc.
+            self.dataframe = mars_dataframe
+
+        def __getstate__(self):
+            state = self.__dict__.copy()
+            state.pop("dataframe", None)
+            return state
+
+        # The default __setstate__ will update _Dataset's __dict__,
+        # so only __getstate__ needs to be overridden.
+
+else:
+    _Dataset = None
 
 
 def to_ray_dataset(df, num_shards: int = None):
@@ -45,10 +66,7 @@ def to_ray_dataset(df, num_shards: int = None):
     # chunk2 & chunk3 for addr2,
     # chunk4 for addr1
     chunk_refs: List["ray.ObjectRef"] = get_chunk_refs(df)
-    # Ray Datasets is available in early preview at ray.data with Ray 1.6+
-    # (and ray.experimental.data in Ray 1.5)
-    real_ray_dataset = ray_dataset or ray_exp_dataset
-    return real_ray_dataset.from_pandas(chunk_refs)
+    return _Dataset(df, ray_dataset.from_pandas_refs(chunk_refs)._blocks)
 
 
 def get_chunk_refs(df):
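Note: `_Dataset` exists purely for lifetime management. The returned Ray Dataset must keep the producing Mars DataFrame alive (otherwise Mars may release the chunks its blocks point at), yet that reference must not travel along when Ray pickles the object to workers. A standalone sketch of the pattern, with hypothetical names:

    class PinningWrapper:
        """Keeps a producer object alive locally but never ships it to workers."""

        def __init__(self, producer, payload):
            self._producer = producer  # pin: producer's resources stay alive
            self.payload = payload

        def __getstate__(self):
            state = self.__dict__.copy()
            state.pop("_producer", None)  # drop the pin before pickling
            return state

        # The default __setstate__ restores the remaining attributes unchanged.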

mars/dataframe/contrib/raydataset/mldataset.py

Lines changed: 38 additions & 37 deletions
@@ -17,7 +17,7 @@
 from collections import defaultdict
 from typing import Dict, Iterable, List, Tuple
 
-from ....utils import ceildiv, lazy_import
+from ....utils import lazy_import
 
 ray = lazy_import("ray")
 parallel_it = lazy_import("ray.util.iter")
@@ -71,43 +71,37 @@ def _group_chunk_refs(
     return group_to_obj_refs
 
 
-def _create_ml_dataset(name: str, group_to_obj_refs: Dict[str, List["ray.ObjectRef"]]):
-    record_batches = []
-    for rank, obj_refs in enumerate(group_to_obj_refs.values()):
-        record_batches.append(ChunkRefBatch(shard_id=rank, obj_refs=obj_refs))
-    worker_cls = ray.remote(num_cpus=0)(parallel_it.ParallelIteratorWorker)
-    actors = [worker_cls.remote(g, False) for g in record_batches]
-    it = parallel_it.from_actors(actors, name)
-    ds = ml_dataset.from_parallel_iter(
-        it, need_convert=False, batch_size=0, repeated=False
-    )
-    return ds
+def _rechunk_if_needed(df, num_shards: int = None):
+    try:
+        if num_shards:
+            assert isinstance(num_shards, int) and num_shards > 0
+            df = df.rebalance(axis=0, num_partitions=num_shards)
+        df = df.rechunk({1: df.shape[1]})
+        df = df.reset_index(drop=True)
+        return df.execute()
+    except Exception as e:  # pragma: no cover
+        raise Exception(f"rechunk failed df.shape {df.shape}") from e
 
 
-def _rechunk_if_needed(df, num_shards: int = None):
-    chunk_size = df.extra_params.raw_chunk_size or max(df.shape)
-    num_rows = df.shape[0]
-    num_columns = df.shape[1]
-    # if chunk size not set, num_chunks_in_row = 1
-    # if chunk size is set more than max(df.shape), num_chunks_in_row = 1
-    # otherwise, num_chunks_in_row depends on ceildiv(num_rows, chunk_size)
-    num_chunks_in_row = ceildiv(num_rows, chunk_size)
-    naive_num_partitions = ceildiv(num_rows, num_columns)
-
-    need_re_execute = False
-    # ensure each part holds all columns
-    if chunk_size < num_columns:
-        df = df.rebalance(axis=1, num_partitions=1)
-        need_re_execute = True
-    if num_shards and num_chunks_in_row < num_shards:
-        df = df.rebalance(axis=0, num_partitions=num_shards)
-        need_re_execute = True
-    if not num_shards and num_chunks_in_row == 1:
-        df = df.rebalance(axis=0, num_partitions=naive_num_partitions)
-        need_re_execute = True
-    if need_re_execute:
-        df.execute()
-    return df
+if ray:
+
+    class _MLDataset(ml_dataset.MLDataset):
+        def __init__(self, mars_dataframe, actor_sets, name: str, parent_iterators):
+            super().__init__(actor_sets, name, parent_iterators, 0, False)
+            # Hold the mars dataframe to keep it and its ray objects from being gc-ed.
+            # TODO(mubai) Use a separate operator for rechunk and avoiding gc.
+            self._mars_dataframe = mars_dataframe
+
+        def __getstate__(self):
+            state = self.__dict__.copy()
+            state.pop("_mars_dataframe", None)
+            return state
+
+        # The default __setstate__ will update _MLDataset's __dict__,
+        # so only __getstate__ needs to be overridden.
+
+else:
+    _MLDataset = None
 
 
 def to_ray_mldataset(df, num_shards: int = None):
@@ -139,4 +133,11 @@ def to_ray_mldataset(df, num_shards: int = None):
     group_to_obj_refs: Dict[str, List[ray.ObjectRef]] = _group_chunk_refs(
         chunk_addr_refs, num_shards
     )
-    return _create_ml_dataset("from_mars", group_to_obj_refs)
+
+    record_batches = []
+    for rank, obj_refs in enumerate(group_to_obj_refs.values()):
+        record_batches.append(ChunkRefBatch(shard_id=rank, obj_refs=obj_refs))
+    worker_cls = ray.remote(num_cpus=0)(parallel_it.ParallelIteratorWorker)
+    actors = [worker_cls.remote(g, False) for g in record_batches]
+    it = parallel_it.from_actors(actors, "from_mars")
+    return _MLDataset(df, it.actor_sets, it.name, it.parent_iterators)
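Note: the new `_rechunk_if_needed` replaces the old chunk-size heuristics with three unconditional steps: rebalance rows into `num_shards` partitions, rechunk so each chunk spans all columns (every shard must see whole rows), and reset the index. The same steps applied by hand, assuming a local Mars session and arbitrary sizes:

    import mars.dataframe as md
    import mars.tensor as mt

    df = md.DataFrame(mt.random.rand(100, 8, chunk_size=25))
    df = df.rebalance(axis=0, num_partitions=4)  # one row partition per shard
    df = df.rechunk({1: df.shape[1]})            # each chunk holds all 8 columns
    df = df.reset_index(drop=True).execute()     # contiguous index across shards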

mars/dataframe/contrib/raydataset/tests/test_mldataset.py

Lines changed: 16 additions & 9 deletions
@@ -18,7 +18,7 @@
 import pytest
 
 from ..... import dataframe as md
-from .....deploy.oscar.ray import new_cluster, _load_config
+from .....deploy.oscar.ray import new_cluster
 from .....deploy.oscar.session import new_session
 from .....tests.core import require_ray
 from .....utils import lazy_import
@@ -31,19 +31,20 @@
     import xgboost_ray
 except ImportError:  # pragma: no cover
     xgboost_ray = None
+try:
+    import sklearn
+except ImportError:  # pragma: no cover
+    sklearn = None
 
 
 @pytest.fixture
 async def create_cluster(request):
-    param = getattr(request, "param", {})
-    ray_config = _load_config()
-    ray_config.update(param.get("config", {}))
     client = await new_cluster(
         "test_cluster",
+        supervisor_mem=1 * 1024 ** 3,
         worker_num=4,
         worker_cpu=2,
         worker_mem=1 * 1024 ** 3,
-        config=ray_config,
     )
     async with client:
         yield client
@@ -89,21 +90,26 @@ async def test_convert_to_ray_mldataset(ray_large_cluster, create_cluster, test_
 @pytest.mark.asyncio
 @pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed")
 async def test_mars_with_xgboost(ray_large_cluster, create_cluster):
-    from xgboost_ray import RayDMatrix, RayParams, train
+    from xgboost_ray import RayDMatrix, RayParams, train, predict
     from sklearn.datasets import load_breast_cancer
 
     assert create_cluster.session
     session = new_session(address=create_cluster.address, backend="oscar", default=True)
     with session:
         train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True)
-        pd_df = pd.concat([train_x, train_y], axis=1)
-        df: md.DataFrame = md.DataFrame(pd_df)
+        df: md.DataFrame = md.concat(
+            [md.DataFrame(train_x), md.DataFrame(train_y)], axis=1
+        )
         df.execute()
 
         num_shards = 4
-        ds = mdd.to_ray_mldataset(df)
+        ds = mdd.to_ray_mldataset(df, num_shards)
         assert isinstance(ds, ml_dataset.MLDataset)
 
+        import gc
+
+        gc.collect()  # Ensure the MLDataset holds the mars dataframe so it is not gc-ed.
+
         # train
         train_set = RayDMatrix(ds, "target")
         evals_result = {}
@@ -124,3 +130,4 @@ async def test_mars_with_xgboost(ray_large_cluster, create_cluster):
     assert os.path.exists("model.xgb")
     os.remove("model.xgb")
     print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1]))
+    predict(bst, train_set, ray_params=RayParams(num_actors=2))
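Note: condensed and outside the pytest harness, the end-to-end flow this test exercises looks roughly like the following; it assumes a running Mars on Ray session, and the hyperparameters are illustrative.

    from sklearn.datasets import load_breast_cancer
    from xgboost_ray import RayDMatrix, RayParams, predict, train

    import mars.dataframe as md
    import mars.dataframe.contrib.raydataset as mdd

    train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True)
    df = md.concat([md.DataFrame(train_x), md.DataFrame(train_y)], axis=1)
    df.execute()

    ds = mdd.to_ray_mldataset(df, num_shards=4)
    train_set = RayDMatrix(ds, "target")  # "target" is the label column
    evals_result = {}
    bst = train(
        {"objective": "binary:logistic", "eval_metric": ["logloss", "error"]},
        train_set,
        evals_result=evals_result,
        evals=[(train_set, "train")],
        ray_params=RayParams(num_actors=2, cpus_per_actor=1),
    )
    predict(bst, train_set, ray_params=RayParams(num_actors=2))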
