 from ... import oscar as mo
 from ...core.entrypoints import init_extension_entrypoints
 from ...lib.aio import get_isolation, stop_isolation
-from ...resource import cpu_count, cuda_count, mem_total, Resource
+from ...resource import cpu_count, cuda_count, mem_total
 from ...services import NodeRole
+from ...services.task.execution.api import ExecutionConfig
 from ...typing import ClusterType, ClientType
 from ..utils import get_third_party_modules_from_config, load_config
 from .pool import create_supervisor_actor_pool, create_worker_actor_pool
@@ -82,7 +83,7 @@ async def new_cluster_in_isolation( |
         n_supervisor_process,
     )
     await cluster.start()
-    return await LocalClient.create(cluster, backend, timeout)
+    return await LocalClient.create(cluster, timeout)
 
 
 async def new_cluster(
@@ -145,53 +146,54 @@ def __init__( |
             subprocess_start_method = (
                 "spawn" if sys.platform == "win32" else "forkserver"
             )
-        # load config file to dict.
         self._address = address
-        self._subprocess_start_method = subprocess_start_method
-        self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
-        if backend is not None:
-            self._config["task"]["task_executor_config"]["backend"] = backend
+        self._n_worker = n_worker
         self._n_cpu = cpu_count() if n_cpu == "auto" else n_cpu
         self._mem_bytes = mem_total() if mem_bytes == "auto" else mem_bytes
+        self._cuda_devices = self._get_cuda_devices(cuda_devices, n_worker)
+        self._subprocess_start_method = subprocess_start_method
+        self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE)
+        execution_config = ExecutionConfig.from_config(self._config, backend=backend)
+        self._backend = execution_config.backend
+        self._web = web
         self._n_supervisor_process = n_supervisor_process
+
+        execution_config.merge_from(
+            ExecutionConfig.from_params(
+                backend=self._backend,
+                n_worker=self._n_worker,
+                n_cpu=self._n_cpu,
+                mem_bytes=self._mem_bytes,
+                cuda_devices=self._cuda_devices,
+            )
+        )
+
+        self._bands_to_resource = execution_config.get_deploy_band_resources()
+        self._supervisor_pool = None
+        self._worker_pools = []
+        self._exiting_check_task = None
+
+        self.supervisor_address = None
+        self.web_address = None
+
+    @staticmethod
+    def _get_cuda_devices(cuda_devices, n_worker):
         if cuda_devices == "auto":
             total = cuda_count()
             all_devices = np.arange(total)
-            devices_list = [list(arr) for arr in np.array_split(all_devices, n_worker)]
+            return [list(arr) for arr in np.array_split(all_devices, n_worker)]
 
         else:  # pragma: no cover
             if isinstance(cuda_devices[0], int):
                 assert n_worker == 1
-                devices_list = [cuda_devices]
+                return [cuda_devices]
             else:
                 assert len(cuda_devices) == n_worker
-                devices_list = cuda_devices
-
-        self._n_worker = n_worker
-        self._web = web
-        self._bands_to_resource = bands_to_resource = []
-        worker_cpus = self._n_cpu // n_worker
-        if sum(len(devices) for devices in devices_list) == 0:
-            assert worker_cpus > 0, (
-                f"{self._n_cpu} cpus are not enough "
-                f"for {n_worker}, try to decrease workers."
-            )
-        mem_bytes = self._mem_bytes // n_worker
-        for _, devices in zip(range(n_worker), devices_list):
-            worker_band_to_resource = dict()
-            worker_band_to_resource["numa-0"] = Resource(
-                num_cpus=worker_cpus, mem_bytes=mem_bytes
-            )
-            for i in devices:  # pragma: no cover
-                worker_band_to_resource[f"gpu-{i}"] = Resource(num_gpus=1)
-            bands_to_resource.append(worker_band_to_resource)
-        self._supervisor_pool = None
-        self._worker_pools = []
+                return cuda_devices
 
-        self.supervisor_address = None
-        self.web_address = None
-
-        self._exiting_check_task = None
+    @property
+    def backend(self):
+        return self._backend
 
     @property
     def external_address(self):
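
Taken together, the hunk above replaces the hand-rolled band/resource bookkeeping in LocalCluster.__init__ with an ExecutionConfig built from the config file and then merged with the runtime parameters. A rough sketch of that flow, using only names visible in this diff (the absolute import paths assume the package root is mars, and the return shape of get_deploy_band_resources is inferred from the code being removed, not confirmed):

    # Sketch only, not the real entry point: a 2-worker box with 8 CPUs and 4 GPUs.
    from mars.deploy.utils import load_config                 # import path assumed
    from mars.deploy.oscar.local import DEFAULT_CONFIG_FILE   # import path assumed
    from mars.services.task.execution.api import ExecutionConfig

    config = load_config(None, default_config_file=DEFAULT_CONFIG_FILE)
    execution_config = ExecutionConfig.from_config(config, backend=None)  # backend=None: fall back to the config file
    execution_config.merge_from(
        ExecutionConfig.from_params(
            backend=execution_config.backend,
            n_worker=2,
            n_cpu=8,
            mem_bytes=16 * 1024**3,
            # what _get_cuda_devices("auto", 2) yields when cuda_count() == 4,
            # since np.array_split(np.arange(4), 2) gives [[0, 1], [2, 3]]
            cuda_devices=[[0, 1], [2, 3]],
        )
    )
    # Presumably one band -> Resource mapping per worker, i.e. the role the removed
    # loop played when it filled {"numa-0": ..., "gpu-<i>": ...} by hand.
    bands = execution_config.get_deploy_band_resources()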
@@ -285,18 +287,11 @@ def __init__(self: ClientType, cluster: ClusterType, session: AbstractSession): |
     async def create(
         cls,
         cluster: LocalCluster,
-        backend: str = None,
         timeout: float = None,
     ) -> ClientType:
-        if backend is None:
-            backend = (
-                cluster._config.get("task", {})
-                .get("task_executor_config", {})
-                .get("backend", "mars")
-            )
         session = await _new_session(
             cluster.external_address,
-            backend=backend,
+            backend=cluster.backend,
             default=True,
             timeout=timeout,
         )
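
With the backend exposed as a property on the cluster, the client side no longer needs its own backend-resolution logic. A minimal usage sketch (cluster construction elided; only calls shown in this diff are used):

    # Given a started LocalCluster instance named cluster:
    client = await LocalClient.create(cluster, timeout=120)
    # Internally this now calls
    #   _new_session(cluster.external_address, backend=cluster.backend, default=True, timeout=timeout)
    # where cluster.backend was resolved once in LocalCluster.__init__ via ExecutionConfig,
    # instead of digging the default out of cluster._config as the removed lines did.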