
Commit b93c02b

fyrestone and 刘宝 authored
Improve deploy backend (#2958)
* new_cluster support backend
* Fix
* Fix
* Fix lint
* Remove unused code

Co-authored-by: 刘宝 <[email protected]>
1 parent 712d7f1 commit b93c02b

File tree

10 files changed: +82 -68 lines


mars/deploy/oscar/base_config.yml

Lines changed: 2 additions & 0 deletions
@@ -29,6 +29,8 @@ task:
     fuse_enabled: yes
     initial_same_color_num: null
     as_broadcaster_successor_num: null
+  task_executor_config:
+    backend: mars
 scheduling:
   autoscale:
     enabled: false
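The new task_executor_config block gives the default configuration an explicit executor backend. A minimal sketch of what this means once the defaults are loaded, assuming oscar/config.yml still inherits base_config.yml and using the _load_config helper added to local.py below:

from mars.deploy.oscar.local import _load_config

config = _load_config()  # no user config passed: defaults only
# the key added by this commit, defaulting to the Mars executor
assert config["task"]["task_executor_config"]["backend"] == "mars"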

mars/deploy/oscar/local.py

Lines changed: 19 additions & 6 deletions
@@ -14,8 +14,9 @@
 
 import asyncio
 import atexit
-import logging
+import os
 import sys
+import logging
 from concurrent.futures import Future as SyncFuture
 from typing import Dict, List, Union
 
@@ -27,14 +28,13 @@
 from ...resource import cpu_count, cuda_count, mem_total, Resource
 from ...services import NodeRole
 from ...typing import ClusterType, ClientType
-from ..utils import get_third_party_modules_from_config
+from ..utils import get_third_party_modules_from_config, load_config
 from .pool import create_supervisor_actor_pool, create_worker_actor_pool
 from .service import (
     start_supervisor,
     start_worker,
     stop_supervisor,
     stop_worker,
-    load_config,
 )
 from .session import AbstractSession, _new_session, ensure_isolation_created
 
@@ -46,6 +46,15 @@
 )
 atexit.register(stop_isolation)
 
+# The default config file.
+DEFAULT_CONFIG_FILE = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "config.yml"
+)
+
+
+def _load_config(config: Union[str, Dict] = None):
+    return load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
+
 
 async def new_cluster_in_isolation(
     address: str = "0.0.0.0",
@@ -67,6 +76,7 @@ async def new_cluster_in_isolation(
         mem_bytes,
         cuda_devices,
         subprocess_start_method,
+        backend,
         config,
         web,
         n_supervisor_process,
@@ -82,6 +92,7 @@ async def new_cluster(
     mem_bytes: Union[int, str] = "auto",
     cuda_devices: Union[List[int], str] = "auto",
     subprocess_start_method: str = None,
+    backend: str = None,
     config: Union[str, Dict] = None,
     web: bool = True,
     loop: asyncio.AbstractEventLoop = None,
@@ -95,6 +106,7 @@ async def new_cluster(
         mem_bytes=mem_bytes,
         cuda_devices=cuda_devices,
         subprocess_start_method=subprocess_start_method,
+        backend=backend,
         config=config,
         web=web,
         n_supervisor_process=n_supervisor_process,
@@ -121,6 +133,7 @@ def __init__(
         mem_bytes: Union[int, str] = "auto",
         cuda_devices: Union[List[int], List[List[int]], str] = "auto",
         subprocess_start_method: str = None,
+        backend: str = None,
         config: Union[str, Dict] = None,
         web: Union[bool, str] = "auto",
         n_supervisor_process: int = 0,
@@ -133,11 +146,11 @@ def __init__(
             "spawn" if sys.platform == "win32" else "forkserver"
         )
         # load config file to dict.
-        if not config or isinstance(config, str):
-            config = load_config(config)
         self._address = address
         self._subprocess_start_method = subprocess_start_method
-        self._config = config
+        self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
+        if backend is not None:
+            self._config["task"]["task_executor_config"]["backend"] = backend
         self._n_cpu = cpu_count() if n_cpu == "auto" else n_cpu
         self._mem_bytes = mem_total() if mem_bytes == "auto" else mem_bytes
         self._n_supervisor_process = n_supervisor_process
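With the new keyword, callers can pick the task executor backend without supplying a full config file. A hedged usage sketch; the cluster lifecycle details around the returned client are illustrative, not part of this diff:

import asyncio

from mars.deploy.oscar.local import new_cluster


async def main():
    # backend=None (the default) keeps the value from the config file,
    # i.e. task.task_executor_config.backend == "mars"; "ray" routes task
    # execution through the Ray DAG backend and needs a Ray installation.
    client = await new_cluster(n_worker=2, n_cpu=2, backend="ray")
    ...  # run computations through the returned client, then shut it down


asyncio.run(main())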

mars/deploy/oscar/ray.py

Lines changed: 10 additions & 21 deletions
@@ -36,9 +36,11 @@
     AbstractClusterBackend,
 )
 from ...services import NodeRole
-from ...utils import merge_dict, flatten_dict_to_nested_dict
 from ...utils import lazy_import
-from ..utils import load_service_config_file, get_third_party_modules_from_config
+from ..utils import (
+    load_config,
+    get_third_party_modules_from_config,
+)
 from .service import start_supervisor, start_worker, stop_supervisor, stop_worker
 from .session import (
     _new_session,
@@ -51,31 +53,18 @@
 ray = lazy_import("ray")
 logger = logging.getLogger(__name__)
 
+# The default config file.
+DEFAULT_CONFIG_FILE = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "rayconfig.yml"
+)
 # The default value for supervisor standalone (not share node with worker).
 DEFAULT_SUPERVISOR_STANDALONE = False
 # The default value for supervisor sub pool count.
 DEFAULT_SUPERVISOR_SUB_POOL_NUM = 0
 
 
 def _load_config(config: Union[str, Dict] = None):
-    # use default config
-    if isinstance(config, str):
-        filename = config
-    else:
-        d = os.path.dirname(os.path.abspath(__file__))
-        filename = os.path.join(d, "rayconfig.yml")
-    full_config = load_service_config_file(filename)
-    if config and not isinstance(config, str):
-        if not isinstance(config, Dict):  # pragma: no cover
-            raise ValueError(f"{config} is not a dict")
-        flatten_keys = set(k for k in config.keys() if isinstance(k, str) and "." in k)
-        nested_flatten_config = flatten_dict_to_nested_dict(
-            {k: config[k] for k in flatten_keys}
-        )
-        nested_config = {k: config[k] for k in config.keys() if k not in flatten_keys}
-        config = merge_dict(nested_config, nested_flatten_config, overwrite=False)
-        merge_dict(full_config, config)
-    return full_config
+    return load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
 
 
 @register_cluster_backend
@@ -421,7 +410,7 @@ def __init__(
         self._worker_cpu = worker_cpu
         self._worker_mem = worker_mem
         # load config file to dict.
-        self._config = _load_config(config)
+        self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
         self.supervisor_address = None
         # Hold actor handles to avoid being freed
         self._supervisor_pool = None
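After this refactor the Ray-specific _load_config is a thin wrapper over the shared loader, so the two spellings below should produce the same merged config. A sketch only; the flattened key is borrowed from the _merge_config docstring further down:

from mars.deploy.oscar.ray import DEFAULT_CONFIG_FILE, _load_config
from mars.deploy.utils import load_config

overrides = {"scheduling.autoscale.enabled": True}
# both paths load rayconfig.yml and merge the same overrides on top of it
assert _load_config(overrides) == load_config(
    overrides, default_config_file=DEFAULT_CONFIG_FILE
)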

mars/deploy/oscar/service.py

Lines changed: 0 additions & 18 deletions
@@ -13,24 +13,14 @@
 # limitations under the License.
 
 import logging
-import os
 from typing import List, Dict, Union
 
 from ...resource import Resource
 from ...services import start_services, stop_services, NodeRole
-from ..utils import load_service_config_file
 
 logger = logging.getLogger(__name__)
 
 
-def load_config(filename=None):
-    # use default config
-    if not filename:  # pragma: no cover
-        d = os.path.dirname(os.path.abspath(__file__))
-        filename = os.path.join(d, "config.yml")
-    return load_service_config_file(filename)
-
-
 async def start_supervisor(
     address: str,
     lookup_address: str = None,
@@ -39,8 +29,6 @@ async def start_supervisor(
     web: Union[str, bool] = "auto",
 ):
     logger.debug("Starting Mars supervisor at %s", address)
-    if not config or isinstance(config, str):
-        config = load_config(config)
     lookup_address = lookup_address or address
     backend = config["cluster"].get("backend", "fixed")
     if backend == "fixed" and config["cluster"].get("lookup_address") is None:
@@ -68,8 +56,6 @@
 
 
 async def stop_supervisor(address: str, config: Dict = None):
-    if not config or isinstance(config, str):
-        config = load_config(config)
     await stop_services(NodeRole.SUPERVISOR, address=address, config=config)
 
 
@@ -82,8 +68,6 @@ async def start_worker(
     mark_ready: bool = True,
 ):
     logger.debug("Starting Mars worker at %s", address)
-    if not config or isinstance(config, str):
-        config = load_config(config)
    backend = config["cluster"].get("backend", "fixed")
     if backend == "fixed" and config["cluster"].get("lookup_address") is None:
         config["cluster"]["lookup_address"] = lookup_address
@@ -103,6 +87,4 @@
 
 
 async def stop_worker(address: str, config: Dict = None):
-    if not config or isinstance(config, str):
-        config = load_config(config)
     await stop_services(NodeRole.WORKER, address=address, config=config)
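The service entry points no longer fall back to loading config.yml themselves; callers are now expected to pass an already-loaded dict. A sketch of the caller-side contract under that assumption, reusing the _load_config helper that this commit adds to local.py:

from mars.deploy.oscar.local import _load_config
from mars.deploy.oscar.service import start_supervisor, stop_supervisor


async def run_supervisor(address: str):
    config = _load_config()  # dict built from the default config.yml
    await start_supervisor(address, config=config)
    try:
        ...  # keep the supervisor running
    finally:
        await stop_supervisor(address, config=config)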

mars/deploy/oscar/tests/local_test_with_ray_dag_config.yml

Lines changed: 0 additions & 10 deletions
This file was deleted.

mars/deploy/oscar/tests/test_checked_session.py

Lines changed: 3 additions & 3 deletions
@@ -22,7 +22,7 @@
 from ....core import TileableType, OperandType
 from ....services.task.supervisor.tests import CheckedTaskPreprocessor
 from ....services.subtask.worker.tests import CheckedSubtaskProcessor
-from ..service import load_config
+from ..local import _load_config
 from ..tests.session import new_test_session, CONFIG_FILE
 
 
@@ -58,7 +58,7 @@ def test_checked_session(setup):
 
 
 def test_check_task_preprocessor(setup):
-    config = load_config(CONFIG_FILE)
+    config = _load_config(CONFIG_FILE)
     config["task"][
         "task_preprocessor_cls"
     ] = "mars.deploy.oscar.tests.test_checked_session.FakeCheckedTaskPreprocessor"
@@ -78,7 +78,7 @@ def test_check_task_preprocessor(setup):
 
 
 def test_check_subtask_processor(setup):
-    config = load_config(CONFIG_FILE)
+    config = _load_config(CONFIG_FILE)
     config["subtask"][
         "subtask_processor_cls"
     ] = "mars.deploy.oscar.tests.test_checked_session.FakeCheckedSubtaskProcessor"

mars/deploy/oscar/tests/test_local.py

Lines changed: 3 additions & 4 deletions
@@ -39,8 +39,7 @@
 from ....services.storage import StorageAPI
 from ....tensor.arithmetic.add import TensorAdd
 from ....tests.core import mock, check_dict_structure_same, DICT_NOT_EMPTY
-from ..local import new_cluster
-from ..service import load_config
+from ..local import new_cluster, _load_config
 from ..session import (
     get_default_async_session,
     get_default_session,
@@ -614,7 +613,7 @@ def cancel():
 
 
 def test_load_third_party_modules(cleanup_third_party_modules_output):  # noqa: F811
-    config = load_config()
+    config = _load_config()
 
     config["third_party_modules"] = set()
     with pytest.raises(TypeError, match="set"):
@@ -698,7 +697,7 @@ async def _exec():
 
 @pytest.fixture
 async def speculative_cluster():
-    config = load_config()
+    config = _load_config()
     # coloring based fusion will make subtask too heterogeneous such that the speculative scheduler can't
     # get enough homogeneous subtasks to calculate statistics
     config["task"]["default_config"]["fuse_enabled"] = False

mars/deploy/oscar/tests/test_ray.py

Lines changed: 1 addition & 1 deletion
@@ -37,9 +37,9 @@
 from ....tests.core import require_ray, mock, DICT_NOT_EMPTY
 from ....utils import lazy_import
 from ..ray import (
-    new_cluster,
     _load_config,
     ClusterStateActor,
+    new_cluster,
     new_cluster_in_ray,
     new_ray_session,
 )

mars/deploy/oscar/tests/test_ray_dag.py

Lines changed: 1 addition & 5 deletions
@@ -28,10 +28,6 @@
 
 ray = lazy_import("ray")
 
-CONFIG_TEST_FILE = os.path.join(
-    os.path.dirname(__file__), "local_test_with_ray_dag_config.yml"
-)
-
 EXPECT_PROFILING_STRUCTURE = {
     "supervisor": {
         "general": {
@@ -61,7 +57,7 @@ async def create_cluster(request):
     start_method = os.environ.get("POOL_START_METHOD", None)
     client = await new_cluster(
         subprocess_start_method=start_method,
-        config=CONFIG_TEST_FILE,
+        backend="ray",
         n_worker=2,
         n_cpu=2,
         use_uvloop=False,

mars/deploy/utils.py

Lines changed: 43 additions & 0 deletions
@@ -20,6 +20,7 @@
 import yaml
 
 from ..services import NodeRole
+from ..utils import merge_dict, flatten_dict_to_nested_dict
 
 DEFAULT_CONFIG_FILE = os.path.join(
     os.path.dirname(os.path.abspath(__file__)), "oscar/config.yml"
@@ -113,6 +114,48 @@ def _clear_meta_cfg(src: Dict):
     return cfg
 
 
+def _merge_config(full_config: Dict, config: Dict) -> Dict:
+    """
+    Merge the config to full_config, the config support flatten key, e.g.
+
+    config={
+        'scheduling.autoscale.enabled': True,
+        'scheduling.autoscale.scheduler_check_interval': 1,
+        'scheduling.autoscale.scheduler_backlog_timeout': 1,
+        'scheduling.autoscale.worker_idle_timeout': 10,
+        'scheduling.autoscale.min_workers': 1,
+        'scheduling.autoscale.max_workers': 4
+    }
+    """
+    if not config:
+        return full_config
+    if not isinstance(config, Dict):  # pragma: no cover
+        raise ValueError(
+            f"The config should be a dict, but the type is {type(config)}."
+        )
+    flatten_keys = set(k for k in config.keys() if isinstance(k, str) and "." in k)
+    nested_flatten_config = flatten_dict_to_nested_dict(
+        {k: config[k] for k in flatten_keys}
+    )
+    nested_config = {k: config[k] for k in config.keys() if k not in flatten_keys}
+    config = merge_dict(nested_config, nested_flatten_config, overwrite=False)
+    merge_dict(full_config, config)
+    return full_config
+
+
+def load_config(config: Union[str, Dict], default_config_file: str):
+    """
+    Load config based on the default_config.
+    """
+    # use default config
+    if isinstance(config, str):
+        filename = config
+        return load_service_config_file(filename)
+    else:
+        full_config = load_service_config_file(default_config_file)
+        return _merge_config(full_config, config)
+
+
 async def wait_all_supervisors_ready(endpoint):
     """
     Wait till all containers are ready
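The _merge_config docstring above describes the flattened-key form; a small sketch of how the new shared loader is expected to behave with it (the override values are illustrative):

from mars.deploy.utils import DEFAULT_CONFIG_FILE, load_config

config = load_config(
    {
        "scheduling.autoscale.enabled": True,
        "scheduling.autoscale.max_workers": 4,
    },
    default_config_file=DEFAULT_CONFIG_FILE,
)
# flattened keys are expanded into nested dicts and merged over the defaults
assert config["scheduling"]["autoscale"]["enabled"] is True
assert config["scheduling"]["autoscale"]["max_workers"] == 4
# keys that are not overridden keep their values from the default file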
