mars-project
diff --git a/mars/deploy/oscar/base_config.yml b/mars/deploy/oscar/base_config.yml
Lines changed: 1 addition & 1 deletion
diff --git a/mars/deploy/oscar/local.py b/mars/deploy/oscar/local.py
Lines changed: 38 additions & 43 deletions
diff --git a/mars/deploy/oscar/tests/session.py b/mars/deploy/oscar/tests/session.py
Lines changed: 3 additions & 1 deletion
diff --git a/mars/deploy/oscar/tests/test_local.py b/mars/deploy/oscar/tests/test_local.py
Lines changed: 45 additions & 0 deletions
diff --git a/mars/deploy/oscar/tests/test_ray_dag.py b/mars/deploy/oscar/tests/test_ray_dag.py
Lines changed: 9 additions & 0 deletions
diff --git a/mars/services/task/execution/api.py b/mars/services/task/execution/api.py
Lines changed: 112 additions & 5 deletions
diff --git a/mars/services/task/execution/mars/__init__.py b/mars/services/task/execution/mars/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -29,7 +29,7 @@ task:
     fuse_enabled: yes
     initial_same_color_num: null
     as_broadcaster_successor_num: null
-  task_executor_config:
+  execution_config:
     backend: mars
 scheduling:
   autoscale:
 
@@ -25,8 +25,9 @@
 from ... import oscar as mo
 from ...core.entrypoints import init_extension_entrypoints
 from ...lib.aio import get_isolation, stop_isolation
-from ...resource import cpu_count, cuda_count, mem_total, Resource
+from ...resource import cpu_count, cuda_count, mem_total
 from ...services import NodeRole
+from ...services.task.execution.api import ExecutionConfig
 from ...typing import ClusterType, ClientType
 from ..utils import get_third_party_modules_from_config, load_config
 from .pool import create_supervisor_actor_pool, create_worker_actor_pool
@@ -82,7 +83,7 @@ async def new_cluster_in_isolation(
         n_supervisor_process,
     )
     await cluster.start()
-    return await LocalClient.create(cluster, backend, timeout)
+    return await LocalClient.create(cluster, timeout)
 
 
 async def new_cluster(
@@ -145,53 +146,54 @@ def __init__(
             subprocess_start_method = (
                 "spawn" if sys.platform == "win32" else "forkserver"
             )
-        # load config file to dict.
         self._address = address
-        self._subprocess_start_method = subprocess_start_method
-        self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
-        if backend is not None:
-            self._config["task"]["task_executor_config"]["backend"] = backend
+        self._n_worker = n_worker
         self._n_cpu = cpu_count() if n_cpu == "auto" else n_cpu
         self._mem_bytes = mem_total() if mem_bytes == "auto" else mem_bytes
+        self._cuda_devices = self._get_cuda_devices(cuda_devices, n_worker)
+        self._subprocess_start_method = subprocess_start_method
+        self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
+        execution_config = ExecutionConfig.from_config(self._config, backend=backend)
+        self._backend = execution_config.backend
+        self._web = web
         self._n_supervisor_process = n_supervisor_process
+
+        execution_config.merge_from(
+            ExecutionConfig.from_params(
+                backend=self._backend,
+                n_worker=self._n_worker,
+                n_cpu=self._n_cpu,
+                mem_bytes=self._mem_bytes,
+                cuda_devices=self._cuda_devices,
+            )
+        )
+
+        self._bands_to_resource = execution_config.get_deploy_band_resources()
+        self._supervisor_pool = None
+        self._worker_pools = []
+        self._exiting_check_task = None
+
+        self.supervisor_address = None
+        self.web_address = None
+
+    @staticmethod
+    def _get_cuda_devices(cuda_devices, n_worker):
+        """
+        Resolve the ``cuda_devices`` argument into one device list per worker.
+
+        * ``"auto"``: the indices ``0 .. cuda_count() - 1`` are split into
+          ``n_worker`` groups with ``np.array_split``.
+        * a flat list whose first element is an ``int``: it is wrapped into a
+          single-element list; ``n_worker`` is asserted to be 1.
+        * anything else: returned unchanged after asserting that its length
+          equals ``n_worker``.
+        """
         if cuda_devices == "auto":
             total = cuda_count()
             all_devices = np.arange(total)
-            devices_list = [list(arr) for arr in np.array_split(all_devices, n_worker)]
+            return [list(arr) for arr in np.array_split(all_devices, n_worker)]
 
         else:  # pragma: no cover
             if isinstance(cuda_devices[0], int):
                 assert n_worker == 1
-                devices_list = [cuda_devices]
+                return [cuda_devices]
             else:
                 assert len(cuda_devices) == n_worker
-                devices_list = cuda_devices
-
-        self._n_worker = n_worker
-        self._web = web
-        self._bands_to_resource = bands_to_resource = []
-        worker_cpus = self._n_cpu // n_worker
-        if sum(len(devices) for devices in devices_list) == 0:
-            assert worker_cpus > 0, (
-                f"{self._n_cpu} cpus are not enough "
-                f"for {n_worker}, try to decrease workers."
-            )
-        mem_bytes = self._mem_bytes // n_worker
-        for _, devices in zip(range(n_worker), devices_list):
-            worker_band_to_resource = dict()
-            worker_band_to_resource["numa-0"] = Resource(
-                num_cpus=worker_cpus, mem_bytes=mem_bytes
-            )
-            for i in devices:  # pragma: no cover
-                worker_band_to_resource[f"gpu-{i}"] = Resource(num_gpus=1)
-            bands_to_resource.append(worker_band_to_resource)
-        self._supervisor_pool = None
-        self._worker_pools = []
+                return cuda_devices
 
-        self.supervisor_address = None
-        self.web_address = None
-
-        self._exiting_check_task = None
+    @property
+    def backend(self):
+        """Return the execution backend name stored in ``self._backend``."""
+        return self._backend
 
     @property
     def external_address(self):
@@ -285,18 +287,11 @@ def __init__(self: ClientType, cluster: ClusterType, session: AbstractSession):
     async def create(
         cls,
         cluster: LocalCluster,
-        backend: str = None,
         timeout: float = None,
     ) -> ClientType:
-        if backend is None:
-            backend = (
-                cluster._config.get("task", {})
-                .get("task_executor_config", {})
-                .get("backend", "mars")
-            )
         session = await _new_session(
             cluster.external_address,
-            backend=backend,
+            backend=cluster.backend,
             default=True,
             timeout=timeout,
         )
 
@@ -119,7 +119,9 @@ async def _new_test_cluster_in_isolation(**new_cluster_kwargs):
             if k in kwargs:
                 new_cluster_params[k] = kwargs.pop(k)
         return (
-            await _new_test_cluster_in_isolation(address=address, **new_cluster_params)
+            await _new_test_cluster_in_isolation(
+                address=address, backend=backend, **new_cluster_params
+            )
         ).session
     return await _get_checked_session(address)
 
 
@@ -55,6 +55,7 @@
     _IsolatedWebSession,
     _execute_with_progress,
 )
+from ..tests.session import new_test_session
 from .modules.utils import (  # noqa: F401; pylint: disable=unused-variable
     cleanup_third_party_modules_output,
     get_output_filenames,
@@ -108,6 +109,8 @@ async def create_cluster(request):
         config = CONFIG_TEST_FILE
     elif request.param == "vineyard":
         config = CONFIG_VINEYARD_TEST_FILE
+    else:
+        config = None
     start_method = os.environ.get("POOL_START_METHOD", None)
     client = await new_cluster(
         subprocess_start_method=start_method,
@@ -135,6 +138,48 @@ async def _assert(session_id: str, addr: str, level: StorageLevel):
     ).result()
 
 
+@pytest.mark.parametrize("backend", ["mars"])
+@pytest.mark.parametrize("_new_session", [new_session, new_test_session])
+def test_new_session_backend(_new_session, backend):
+    """
+    Check that creating a session with ``backend`` goes through the config
+    class registered for that backend.
+
+    The test verifies that the config class is instantiated, that its
+    ``get_deploy_band_resources`` is called, that the number of worker pools
+    equals the number of returned band resources, and that a small tensor
+    computation produces the expected result. After the server is stopped
+    there must be no default async session left.
+    """
+    from ....services.task.execution.api import _name_to_config_cls
+
+    config_cls = _name_to_config_cls[backend]
+    # Keep references to the real implementations before they are patched.
+    original_config_init = config_cls.__init__
+    original_deploy_band_resources = config_cls.get_deploy_band_resources
+    with mock.patch.object(
+        config_cls, "__init__", autospec=True
+    ) as config_init, mock.patch.object(
+        config_cls, "get_deploy_band_resources", autospec=True
+    ) as deploy_band_resources:
+        return_deploy_band_resources = []
+
+        def _wrap_original_deploy_band_resources(*args, **kwargs):
+            # Call the real method and remember what it returned so the
+            # result can be compared with the created worker pools below.
+            nonlocal return_deploy_band_resources
+            return_deploy_band_resources = original_deploy_band_resources(
+                *args, **kwargs
+            )
+            return return_deploy_band_resources
+
+        # The mocks only record calls; the side effects delegate to the
+        # original implementations so the cluster is still set up for real.
+        config_init.side_effect = original_config_init
+        deploy_band_resources.side_effect = _wrap_original_deploy_band_resources
+        sess = _new_session(
+            backend=backend, n_cpu=2, web=False, use_uvloop=False, default=True
+        )
+        try:
+            assert config_init.call_count > 0
+            assert deploy_band_resources.call_count > 0
+            worker_pools = sess.default.client._cluster._worker_pools
+            assert len(worker_pools) == len(return_deploy_band_resources)
+            a = mt.ones((10, 10))
+            b = a + 1
+            res = b.to_numpy()
+            np.testing.assert_array_equal(res, np.ones((10, 10)) + 1)
+        finally:
+            sess.stop_server()
+
+    assert get_default_async_session() is None
+
+
 @pytest.mark.asyncio
 async def test_vineyard_operators(create_cluster):
     param = create_cluster[1]
 
@@ -20,7 +20,9 @@
 from ....tests.core import DICT_NOT_EMPTY, require_ray
 from ....utils import lazy_import
 from ..local import new_cluster
+from ..session import new_session
 from ..tests import test_local
+from ..tests.session import new_test_session
 from .modules.utils import (  # noqa: F401; pylint: disable=unused-variable
     cleanup_third_party_modules_output,
     get_output_filenames,
@@ -67,6 +69,13 @@ async def create_cluster(request):
         yield client, {}
 
 
+@require_ray
+@pytest.mark.parametrize("backend", ["ray"])
+@pytest.mark.parametrize("_new_session", [new_session, new_test_session])
+def test_new_session_backend(ray_start_regular_shared2, _new_session, backend):
+    """
+    Run the backend test from ``test_local`` with the ``ray`` backend.
+
+    ``ray_start_regular_shared2`` is not used in the body; presumably it is a
+    fixture that starts ray for the test -- TODO confirm.
+    """
+    test_local.test_new_session_backend(_new_session, backend)
+
+
 @require_ray
 @pytest.mark.parametrize(
     "config",
 
@@ -14,14 +14,121 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import List, Dict, Any, Type
+from typing import List, Dict, Any, Type, Union
 
 from ....core import ChunkGraph, Chunk, TileContext
 from ....resource import Resource
 from ....typing import BandType
+from ....utils import merge_dict
 from ...subtask import SubtaskGraph, SubtaskResult
 
 
+class ExecutionConfig(ABC):
+    """
+    The config for execution backends.
+
+    This class should ONLY provide the APIs used by the parts other than
+    the execution itself. Each backend may have a different implementation
+    of the API.
+
+    If some configuration is for a specific backend, it should be in
+    the backend config, e.g. `get_mars_special_config()` should be in
+    the `MarsExecutionConfig`.
+
+    The class derives from `ABC` so that `get_deploy_band_resources` is
+    enforced: a subclass that does not implement it cannot be instantiated.
+    Concrete subclasses set `name` to their backend name, which is the key
+    used by `register_config_cls`.
+    """
+
+    # Backend name of the concrete config class; None on the base class.
+    name = None
+
+    def __init__(self, execution_config: Dict):
+        """
+        Wrap an execution config dict. The dict is stored by reference,
+        not copied.
+
+        An example of execution_config:
+        {
+            "backend": "mars",
+            "mars": {
+                "n_worker": 1,
+                "n_cpu": 2,
+                ...
+            },
+        }
+        """
+        self._execution_config = execution_config
+
+    def merge_from(self, execution_config: "ExecutionConfig") -> "ExecutionConfig":
+        """
+        Merge another execution config into this one with `merge_dict`.
+
+        The other config must be an `ExecutionConfig` with the same backend.
+        The dict wrapped by this instance is passed to `merge_dict` as the
+        first argument. Returns `self` so that calls can be chained.
+        """
+        assert isinstance(execution_config, ExecutionConfig)
+        assert self.backend == execution_config.backend
+        merge_dict(
+            self._execution_config,
+            execution_config.get_execution_config(),
+        )
+        return self
+
+    @property
+    def backend(self) -> str:
+        """The backend from config."""
+        return self._execution_config["backend"]
+
+    def get_execution_config(self) -> Dict:
+        """Get the execution config dict (the wrapped dict itself, not a copy)."""
+        return self._execution_config
+
+    @abstractmethod
+    def get_deploy_band_resources(self) -> List[Dict[str, Resource]]:
+        """
+        Get the band resources for deployment.
+
+        Must be implemented by each backend config. Presumably returns one
+        band-name-to-resource mapping per worker -- confirm against the
+        concrete implementations.
+        """
+
+    @classmethod
+    def from_config(cls, config: Dict, backend: str = None) -> "ExecutionConfig":
+        """
+        Construct an execution config instance from config.
+
+        Reads `config["task"]["execution_config"]` and forwards it, together
+        with `backend`, to `from_execution_config`.
+        """
+        execution_config = config["task"]["execution_config"]
+        return cls.from_execution_config(execution_config, backend)
+
+    @classmethod
+    def from_execution_config(
+        cls, execution_config: Union[Dict, "ExecutionConfig"], backend: str = None
+    ) -> "ExecutionConfig":
+        """
+        Construct an execution config instance from execution config.
+
+        If `execution_config` is already an `ExecutionConfig` it is returned
+        as is, and `backend` must be None. Otherwise the "backend" key of the
+        given dict is overwritten with `backend` when one is passed, or
+        defaulted to "mars" when the key is missing; the dict is modified in
+        place. The config class registered for that backend name is then
+        instantiated with the dict.
+
+        Raises `KeyError` if no config class is registered for the backend.
+        """
+        if isinstance(execution_config, ExecutionConfig):
+            assert backend is None
+            return execution_config
+        if backend is not None:
+            name = execution_config["backend"] = backend
+        else:
+            name = execution_config.setdefault("backend", "mars")
+        config_cls = _name_to_config_cls[name]
+        return config_cls(execution_config)
+
+    @classmethod
+    def from_params(
+        cls,
+        backend: str,
+        n_worker: int,
+        n_cpu: int,
+        mem_bytes: int = 0,
+        cuda_devices: List[List[int]] = None,
+        **kwargs,
+    ) -> "ExecutionConfig":
+        """
+        Construct an execution config instance from params.
+
+        The params, plus any extra keyword arguments, are stored in a
+        sub-dict keyed by the backend name, next to the "backend" key itself.
+        """
+        execution_config = {
+            "backend": backend,
+            backend: dict(
+                {
+                    "n_worker": n_worker,
+                    "n_cpu": n_cpu,
+                    "mem_bytes": mem_bytes,
+                    "cuda_devices": cuda_devices,
+                },
+                **kwargs,
+            ),
+        }
+        return cls.from_execution_config(execution_config)
+
+
+_name_to_config_cls: Dict[str, Type[ExecutionConfig]] = {}
+
+
+def register_config_cls(config_cls: Type[ExecutionConfig]):
+    """
+    Register `config_cls` in `_name_to_config_cls` under its `name`.
+
+    The class is returned unchanged, so the function can also be applied
+    as a class decorator.
+    """
+    backend_name = config_cls.name
+    _name_to_config_cls[backend_name] = config_cls
+    return config_cls
+
+
 @dataclass
 class ExecutionChunkResult:
     meta: Dict  # The chunk meta for iterative tiling.
@@ -35,17 +142,16 @@ class TaskExecutor(ABC):
     @abstractmethod
     async def create(
         cls,
-        config: Dict,
+        config: Union[Dict, ExecutionConfig],
         *,
         session_id: str,
         address: str,
         task,
         tile_context: TileContext,
         **kwargs,
     ) -> "TaskExecutor":
-        name = config.get("backend", "mars")
-        backend_config = config.get(name, {})
-        executor_cls = _name_to_task_executor_cls[name]
+        backend_config = ExecutionConfig.from_execution_config(config)
+        executor_cls = _name_to_task_executor_cls[backend_config.backend]
         if executor_cls.create.__func__ is TaskExecutor.create.__func__:
             raise NotImplementedError(
                 f"The {executor_cls} should implement the abstract classmethod `create`."
@@ -102,6 +208,7 @@ def get_stage_processors(self):
 
 def register_executor_cls(executor_cls: Type[TaskExecutor]):
+    # Register the executor class under its `name` and return the class
+    # unchanged, so the function can also be applied as a class decorator.
     _name_to_task_executor_cls[executor_cls.name] = executor_cls
+    return executor_cls
 
 
 class Fetcher:
 
@@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .config import MarsExecutionConfig
 from .executor import MarsTaskExecutor
 from .fetcher import MarsFetcher