Skip to content

Commit 9cfcc94

Browse files
authored
Basic Ray execution backend (#2921)
1 parent 11dc135 commit 9cfcc94

File tree

30 files changed

+1295
-376
lines changed

30 files changed

+1295
-376
lines changed

mars/conftest.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ def ray_start_regular_shared(request): # pragma: no cover
3535
yield from _ray_start_regular(request)
3636

3737

38+
@pytest.fixture(scope="module")
def ray_start_regular_shared2(request):  # pragma: no cover
    """Start a Ray runtime once per test module and shut it down afterwards.

    Optional settings are read from ``request.param`` (a dict) when the
    fixture is parametrized; without parametrization an empty dict is used.
    The only key consulted is ``"num_cpus"``, which defaults to 64.

    Yields:
        Whatever ``ray.init`` returns for the started runtime.
    """
    options = getattr(request, "param", {})
    cpu_count = options.get("num_cpus", 64)
    # NOTE(review): the value is passed as ``total_memory_mb`` but is scaled
    # by 1024**2, which looks like a byte count -- confirm the intended unit.
    memory_budget = cpu_count * 2 * 1024**2
    try:
        # The JobConfig construction is kept inside the outer ``try`` so that
        # ``ray.shutdown()`` still runs if it fails with anything other than
        # the TypeError handled below.
        try:
            ray_job_config = ray.job_config.JobConfig(total_memory_mb=memory_budget)
        except TypeError:
            # Presumably raised by Ray builds whose JobConfig does not accept
            # ``total_memory_mb``; fall back to Ray's default job config.
            ray_job_config = None
        yield ray.init(num_cpus=cpu_count, job_config=ray_job_config)
    finally:
        ray.shutdown()
51+
52+
3853
@pytest.fixture
3954
def ray_start_regular(request): # pragma: no cover
4055
yield from _ray_start_regular(request)

mars/dataframe/contrib/raydataset/tests/test_mldataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ async def test_convert_to_ray_mldataset(
7676
ray_start_regular_shared, create_cluster, test_option
7777
):
7878
assert create_cluster.session
79-
session = new_session(address=create_cluster.address, backend="oscar", default=True)
79+
session = new_session(address=create_cluster.address, default=True)
8080
with session:
8181
value = np.random.rand(10, 10)
8282
chunk_size, num_shards = test_option
@@ -95,7 +95,7 @@ async def test_mars_with_xgboost(ray_start_regular_shared, create_cluster):
9595
from sklearn.datasets import load_breast_cancer
9696

9797
assert create_cluster.session
98-
session = new_session(address=create_cluster.address, backend="oscar", default=True)
98+
session = new_session(address=create_cluster.address, default=True)
9999
with session:
100100
train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True)
101101
df: md.DataFrame = md.concat(

mars/dataframe/contrib/raydataset/tests/test_raydataset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ async def test_convert_to_ray_dataset(
5858
ray_start_regular_shared, create_cluster, test_option
5959
):
6060
assert create_cluster.session
61-
session = new_session(address=create_cluster.address, backend="oscar", default=True)
61+
session = new_session(address=create_cluster.address, default=True)
6262
with session:
6363
value = np.random.rand(10, 10)
6464
chunk_size, num_shards = test_option
@@ -77,7 +77,7 @@ async def test_mars_with_xgboost(ray_start_regular_shared, create_cluster):
7777
from sklearn.datasets import load_breast_cancer
7878

7979
assert create_cluster.session
80-
session = new_session(address=create_cluster.address, backend="oscar", default=True)
80+
session = new_session(address=create_cluster.address, default=True)
8181
with session:
8282
train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True)
8383
pd_df = pd.concat([train_x, train_y], axis=1)
@@ -119,7 +119,7 @@ async def test_mars_with_xgboost_sklearn_clf(ray_start_regular_shared, create_cl
119119
from sklearn.datasets import load_breast_cancer
120120

121121
assert create_cluster.session
122-
session = new_session(address=create_cluster.address, backend="oscar", default=True)
122+
session = new_session(address=create_cluster.address, default=True)
123123
with session:
124124
train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True)
125125
df: md.DataFrame = md.concat(
@@ -161,7 +161,7 @@ async def test_mars_with_xgboost_sklearn_reg(ray_start_regular_shared, create_cl
161161
from sklearn.datasets import make_regression
162162

163163
assert create_cluster.session
164-
session = new_session(address=create_cluster.address, backend="oscar", default=True)
164+
session = new_session(address=create_cluster.address, default=True)
165165
with session:
166166
np_X, np_y = make_regression(n_samples=1_0000, n_features=10)
167167
X, y = md.DataFrame(np_X), md.DataFrame({"target": np_y})

mars/deploy/oscar/local.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ async def new_cluster_in_isolation(
6060
timeout: float = None,
6161
n_supervisor_process: int = 0,
6262
) -> ClientType:
63-
if subprocess_start_method is None:
64-
subprocess_start_method = "spawn" if sys.platform == "win32" else "forkserver"
6563
cluster = LocalCluster(
6664
address,
6765
n_worker,
@@ -125,11 +123,15 @@ def __init__(
125123
subprocess_start_method: str = None,
126124
config: Union[str, Dict] = None,
127125
web: Union[bool, str] = "auto",
128-
timeout: float = None,
129126
n_supervisor_process: int = 0,
130127
):
131128
# load third party extensions.
132129
init_extension_entrypoints()
130+
# auto choose the subprocess_start_method.
131+
if subprocess_start_method is None:
132+
subprocess_start_method = (
133+
"spawn" if sys.platform == "win32" else "forkserver"
134+
)
133135
# load config file to dict.
134136
if not config or isinstance(config, str):
135137
config = load_config(config)
@@ -268,11 +270,22 @@ def __init__(self: ClientType, cluster: ClusterType, session: AbstractSession):
268270

269271
@classmethod
270272
async def create(
271-
cls, cluster: LocalCluster, backend: str = None, timeout: float = None
273+
cls,
274+
cluster: LocalCluster,
275+
backend: str = None,
276+
timeout: float = None,
272277
) -> ClientType:
273-
backend = backend or "oscar"
278+
if backend is None:
279+
backend = (
280+
cluster._config.get("task", {})
281+
.get("task_executor_config", {})
282+
.get("backend", "mars")
283+
)
274284
session = await _new_session(
275-
cluster.external_address, backend=backend, default=True, timeout=timeout
285+
cluster.external_address,
286+
backend=backend,
287+
default=True,
288+
timeout=timeout,
276289
)
277290
client = LocalClient(cluster, session)
278291
session.client = client

mars/deploy/oscar/ray.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ def new_ray_session(
389389
session_id = session_id or client.session.session_id
390390
address = client.address
391391
session = new_session(
392-
address=address, session_id=session_id, backend="oscar", default=default
392+
address=address, session_id=session_id, backend="mars", default=default
393393
)
394394
session._ray_client = client
395395
if default:

mars/deploy/oscar/session.py

Lines changed: 45 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from numbers import Integral
3030
from urllib.parse import urlparse
3131
from weakref import ref, WeakKeyDictionary, WeakSet
32-
from typing import Any, Callable, Coroutine, Dict, List, Optional, Tuple, Type, Union
32+
from typing import Any, Callable, Coroutine, Dict, List, Optional, Tuple, Union
3333

3434
import numpy as np
3535

@@ -54,6 +54,7 @@
5454
from ...services.mutable import MutableAPI, MutableTensor
5555
from ...services.storage import StorageAPI
5656
from ...services.task import AbstractTaskAPI, TaskAPI, TaskResult
57+
from ...services.task.execution.api import Fetcher
5758
from ...services.web import OscarWebAPI
5859
from ...tensor.utils import slice_split
5960
from ...typing import ClientType, BandType
@@ -441,7 +442,7 @@ def init(
441442
cls,
442443
address: str,
443444
session_id: str,
444-
backend: str = "oscar",
445+
backend: str = "mars",
445446
new: bool = True,
446447
**kwargs,
447448
) -> "AbstractSession":
@@ -658,14 +659,6 @@ def fetch_log(
658659
return fetch(tileables, self, offsets=offsets, sizes=sizes)
659660

660661

661-
_type_name_to_session_cls: Dict[str, Type[AbstractAsyncSession]] = dict()
662-
663-
664-
def register_session_cls(session_cls: Type[AbstractAsyncSession]):
665-
_type_name_to_session_cls[session_cls.name] = session_cls
666-
return session_cls
667-
668-
669662
@dataclass
670663
class ChunkFetchInfo:
671664
tileable: TileableType
@@ -755,14 +748,12 @@ def gen_submit_tileable_graph(
755748
return graph, to_execute_tileables
756749

757750

758-
@register_session_cls
759751
class _IsolatedSession(AbstractAsyncSession):
760-
name = "oscar"
761-
762752
def __init__(
763753
self,
764754
address: str,
765755
session_id: str,
756+
backend: str,
766757
session_api: AbstractSessionAPI,
767758
meta_api: AbstractMetaAPI,
768759
lifecycle_api: AbstractLifecycleAPI,
@@ -775,6 +766,7 @@ def __init__(
775766
request_rewriter: Callable = None,
776767
):
777768
super().__init__(address, session_id)
769+
self._backend = backend
778770
self._session_api = session_api
779771
self._task_api = task_api
780772
self._meta_api = meta_api
@@ -800,7 +792,12 @@ def __init__(
800792

801793
@classmethod
802794
async def _init(
803-
cls, address: str, session_id: str, new: bool = True, timeout: float = None
795+
cls,
796+
address: str,
797+
session_id: str,
798+
backend: str,
799+
new: bool = True,
800+
timeout: float = None,
804801
):
805802
session_api = await SessionAPI.create(address)
806803
if new:
@@ -820,6 +817,7 @@ async def _init(
820817
return cls(
821818
address,
822819
session_id,
820+
backend,
823821
session_api,
824822
meta_api,
825823
lifecycle_api,
@@ -836,6 +834,7 @@ async def init(
836834
cls,
837835
address: str,
838836
session_id: str,
837+
backend: str,
839838
new: bool = True,
840839
timeout: float = None,
841840
**kwargs,
@@ -859,12 +858,19 @@ async def init(
859858
return await _IsolatedWebSession._init(
860859
address,
861860
session_id,
861+
backend,
862862
new=new,
863863
timeout=timeout,
864864
request_rewriter=request_rewriter,
865865
)
866866
else:
867-
return await cls._init(address, session_id, new=new, timeout=timeout)
867+
return await cls._init(
868+
address,
869+
session_id,
870+
backend,
871+
new=new,
872+
timeout=timeout,
873+
)
868874

869875
async def _update_progress(self, task_id: str, progress: Progress):
870876
zero_acc_time = 0
@@ -1084,6 +1090,8 @@ async def fetch(self, *tileables, **kwargs) -> list:
10841090
unexpected_keys = ", ".join(list(kwargs.keys()))
10851091
raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}")
10861092

1093+
fetcher = Fetcher.create(self._backend, get_storage_api=self._get_storage_api)
1094+
10871095
with enter_mode(build=True):
10881096
chunks = []
10891097
get_chunk_metas = []
@@ -1099,7 +1107,10 @@ async def fetch(self, *tileables, **kwargs) -> list:
10991107
continue
11001108
chunks.append(chunk)
11011109
get_chunk_metas.append(
1102-
self._meta_api.get_chunk_meta.delay(chunk.key, fields=["bands"])
1110+
self._meta_api.get_chunk_meta.delay(
1111+
chunk.key,
1112+
fields=fetcher.required_meta_keys,
1113+
)
11031114
)
11041115
indexes = (
11051116
chunk_to_slice[chunk] if chunk_to_slice is not None else None
@@ -1108,29 +1119,17 @@ async def fetch(self, *tileables, **kwargs) -> list:
11081119
ChunkFetchInfo(tileable=tileable, chunk=chunk, indexes=indexes)
11091120
)
11101121
fetch_infos_list.append(fetch_infos)
1111-
chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas)
1112-
chunk_to_band = {
1113-
chunk: meta["bands"][0] for chunk, meta in zip(chunks, chunk_metas)
1114-
}
11151122

1116-
storage_api_to_gets = defaultdict(list)
1117-
storage_api_to_fetch_infos = defaultdict(list)
1118-
for fetch_info in itertools.chain(*fetch_infos_list):
1119-
conditions = fetch_info.indexes
1120-
chunk = fetch_info.chunk
1121-
band = chunk_to_band[chunk]
1122-
storage_api = await self._get_storage_api(band)
1123-
storage_api_to_gets[storage_api].append(
1124-
storage_api.get.delay(chunk.key, conditions=conditions)
1125-
)
1126-
storage_api_to_fetch_infos[storage_api].append(fetch_info)
1127-
for storage_api in storage_api_to_gets:
1128-
fetched_data = await storage_api.get.batch(
1129-
*storage_api_to_gets[storage_api]
1130-
)
1131-
infos = storage_api_to_fetch_infos[storage_api]
1132-
for info, data in zip(infos, fetched_data):
1133-
info.data = data
1123+
chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas)
1124+
for chunk, meta, fetch_info in zip(
1125+
chunks, chunk_metas, itertools.chain(*fetch_infos_list)
1126+
):
1127+
await fetcher.append(chunk.key, meta, fetch_info.indexes)
1128+
fetched_data = await fetcher.get()
1129+
for fetch_info, data in zip(
1130+
itertools.chain(*fetch_infos_list), fetched_data
1131+
):
1132+
fetch_info.data = data
11341133

11351134
result = []
11361135
for tileable, fetch_infos in zip(tileables, fetch_infos_list):
@@ -1317,6 +1316,7 @@ async def _init(
13171316
cls,
13181317
address: str,
13191318
session_id: str,
1319+
backend: str,
13201320
new: bool = True,
13211321
timeout: float = None,
13221322
request_rewriter: Callable = None,
@@ -1341,6 +1341,7 @@ async def _init(
13411341
return cls(
13421342
address,
13431343
session_id,
1344+
backend,
13441345
session_api,
13451346
meta_api,
13461347
lifecycle_api,
@@ -1415,13 +1416,12 @@ async def init(
14151416
cls,
14161417
address: str,
14171418
session_id: str,
1418-
backend: str = "oscar",
1419+
backend: str = "mars",
14191420
new: bool = True,
14201421
**kwargs,
14211422
) -> "AbstractSession":
1422-
session_cls = _type_name_to_session_cls[backend]
14231423
isolation = ensure_isolation_created(kwargs)
1424-
coro = session_cls.init(address, session_id, new=new, **kwargs)
1424+
coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs)
14251425
fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop)
14261426
isolated_session = await asyncio.wrap_future(fut)
14271427
return AsyncSession(address, session_id, isolated_session, isolation)
@@ -1587,13 +1587,12 @@ def init(
15871587
cls,
15881588
address: str,
15891589
session_id: str,
1590-
backend: str = "oscar",
1590+
backend: str = "mars",
15911591
new: bool = True,
15921592
**kwargs,
15931593
) -> "AbstractSession":
1594-
session_cls = _type_name_to_session_cls[backend]
15951594
isolation = ensure_isolation_created(kwargs)
1596-
coro = session_cls.init(address, session_id, new=new, **kwargs)
1595+
coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs)
15971596
fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop)
15981597
isolated_session = fut.result()
15991598
return SyncSession(address, session_id, isolated_session, isolation)
@@ -1963,7 +1962,7 @@ def _new_session_id():
19631962
async def _new_session(
19641963
address: str,
19651964
session_id: str = None,
1966-
backend: str = "oscar",
1965+
backend: str = "mars",
19671966
default: bool = False,
19681967
**kwargs,
19691968
) -> AbstractSession:
@@ -1981,7 +1980,7 @@ async def _new_session(
19811980
def new_session(
19821981
address: str = None,
19831982
session_id: str = None,
1984-
backend: str = "oscar",
1983+
backend: str = "mars",
19851984
default: bool = True,
19861985
new: bool = True,
19871986
**kwargs,
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"@inherits": '@default'
2+
session:
3+
custom_log_dir: auto
4+
plasma:
5+
store_memory: 32M
6+
scheduling:
7+
mem_hard_limit: 0
8+
task:
9+
task_executor_config:
10+
"backend": "ray"

0 commit comments

Comments
 (0)