|
5 | 5 | "execution_count": null, |
6 | 6 | "id": "c443b989-5a71-455f-9a59-9963338634ec", |
7 | 7 | "metadata": {}, |
8 | | - "outputs": [ |
9 | | - { |
10 | | - "name": "stderr", |
11 | | - "output_type": "stream", |
12 | | - "text": [ |
13 | | - "<string>:8: FutureWarning: Setting `workspace='/home/ubuntu/ahmads/monarch/examples'` is deprecated.\n", |
14 | | - "torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `squeue` (squeue: error: Invalid job id: monarch-ubuntu\n", |
15 | | - "), trying `sacct`\n", |
16 | | - "torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `sacct` (sacct: fatal: Bad job/step specified: monarch-ubuntu\n", |
17 | | - ")\n", |
18 | | - "monarch.tools.commands 2025-08-29 21:02:20 INFO no existing RUNNING server `slurm:///monarch-ubuntu` creating new one...\n", |
19 | | - "torchx.runner.api 2025-08-29 21:02:20 INFO Tracker configurations: {}\n", |
20 | | - "torchx.runner.api 2025-08-29 21:02:20 INFO Checking for changes in workspace `/home/ubuntu/.monarch/out/tmpkk97qppi/workspace`...\n", |
21 | | - "torchx.runner.api 2025-08-29 21:02:20 INFO To disable workspaces pass: --workspace=\"\" from CLI or workspace=None programmatically.\n", |
22 | | - "torchx.runner.api 2025-08-29 21:02:20 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.\n", |
23 | | - "monarch.tools.commands 2025-08-29 21:02:20 INFO created new `slurm:///418` waiting for it to be ready...\n" |
24 | | - ] |
25 | | - }, |
26 | | - { |
27 | | - "name": "stdout", |
28 | | - "output_type": "stream", |
29 | | - "text": [ |
30 | | - "Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n", |
31 | | - "Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n", |
32 | | - "Waiting for slurm:///418 to be RUNNING (current: PENDING); will check again in 5.0 seconds. Total wait time: 0:00:20.105986\r" |
33 | | - ] |
34 | | - }, |
35 | | - { |
36 | | - "name": "stderr", |
37 | | - "output_type": "stream", |
38 | | - "text": [ |
39 | | - "slurm.utils 2025-08-29 21:02:45 INFO \n", |
40 | | - "===== Server Info =====\n", |
41 | | - "{\n", |
42 | | - " \"name\": \"418\",\n", |
43 | | - " \"server_handle\": \"slurm:///418\",\n", |
44 | | - " \"state\": \"RUNNING\",\n", |
45 | | - " \"meshes\": {\n", |
46 | | - " \"mesh0\": {\n", |
47 | | - " \"host_type\": \"__UNSET__\",\n", |
48 | | - " \"hosts\": 2,\n", |
49 | | - " \"gpus\": -1,\n", |
50 | | - " \"hostnames\": [\n", |
51 | | - " \"gpu-queue-st-gpu-compute-1\",\n", |
52 | | - " \"gpu-queue-st-gpu-compute-2\"\n", |
53 | | - " ]\n", |
54 | | - " }\n", |
55 | | - " }\n", |
56 | | - "}\n", |
57 | | - "__main__ 2025-08-29 21:02:45 INFO computing world size...\n", |
58 | | - "monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n", |
59 | | - "monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n", |
60 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n", |
61 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n", |
62 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n", |
63 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n", |
64 | | - "monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n", |
65 | | - "monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n", |
66 | | - "monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n", |
67 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n", |
68 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n", |
69 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n", |
70 | | - "monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n", |
71 | | - "monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n" |
72 | | - ] |
73 | | - }, |
74 | | - { |
75 | | - "name": "stdout", |
76 | | - "output_type": "stream", |
77 | | - "text": [ |
78 | | - "New job `slurm:///418` is ready to serve.\n" |
79 | | - ] |
80 | | - }, |
81 | | - { |
82 | | - "name": "stderr", |
83 | | - "output_type": "stream", |
84 | | - "text": [ |
85 | | - "__main__ 2025-08-29 21:02:53 INFO computed world_sizes:\n", |
86 | | - " ----------------------------------------\n", |
87 | | - " {\n", |
88 | | - " \"rank_0\": 8,\n", |
89 | | - " \"rank_1\": 8,\n", |
90 | | - " \"rank_2\": 8,\n", |
91 | | - " \"rank_3\": 8,\n", |
92 | | - " \"rank_4\": 8,\n", |
93 | | - " \"rank_5\": 8,\n", |
94 | | - " \"rank_6\": 8,\n", |
95 | | - " \"rank_7\": 8\n", |
96 | | - "}\n", |
97 | | - " ----------------------------------------\n" |
98 | | - ] |
99 | | - }, |
100 | | - { |
101 | | - "name": "stdout", |
102 | | - "output_type": "stream", |
103 | | - "text": [ |
104 | | - "\u001b[36m>>> Aggregated Logs (2025-08-29 21:02:51) >>>\u001b[0m\n", |
105 | | - "\u001b[33m[8 similar log lines]\u001b[0m Initializing process group `nccl`:\n", |
106 | | - "\u001b[33m[8 similar log lines]\u001b[0m MASTER_ADDR = gpu-queue-st-gpu-compute-1\n", |
107 | | - "\u001b[33m[8 similar log lines]\u001b[0m MASTER_PORT = 29500\n", |
108 | | - "\u001b[33m[8 similar log lines]\u001b[0m RANK = 5\n", |
109 | | - "\u001b[33m[8 similar log lines]\u001b[0m WORLD_SIZE = 8\n", |
110 | | - "\u001b[36m<<< Aggregated Logs (2025-08-29 21:02:54) <<<\u001b[0m\n", |
111 | | - "\n" |
112 | | - ] |
113 | | - } |
114 | | - ], |
| 8 | + "outputs": [], |
115 | 9 | "source": [ |
116 | 10 | "# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.\n", |
117 | 11 | "\n", |
118 | 12 | "# @noautodeps\n", |
119 | 13 | "# pyre-ignore-all-errors\n", |
120 | 14 | "import json\n", |
121 | 15 | "import logging\n", |
| 16 | + "import socket\n", |
122 | 17 | "import sys\n", |
123 | 18 | "\n", |
124 | 19 | "import cloudpickle\n", |
125 | | - "from monarch.tools import commands\n", |
126 | 20 | "from example_actors.compute_world_size_actor import ComputeWorldSizeActor\n", |
127 | | - "from slurm.utils import get_appdef, get_server_info, create_proc_mesh\n", |
| 21 | + "from monarch.actor import Actor, endpoint\n", |
| 22 | + "from monarch.job import SlurmJob\n", |
128 | 23 | "\n", |
129 | 24 | "\n", |
130 | 25 | "logging.basicConfig(\n", |
|
138 | 33 | "logger: logging.Logger = logging.getLogger(__name__)\n", |
139 | 34 | "\n", |
140 | 35 | "\n", |
| 36 | + "class _HostnameActor(Actor):\n", |
| 37 | + " \"\"\"Helper actor to get hostname from rank 0\"\"\"\n", |
| 38 | + " @endpoint\n", |
| 39 | + " def get_hostname(self) -> str:\n", |
| 40 | + " return socket.gethostname()\n", |
| 41 | + "\n", |
| 42 | + "\n", |
141 | 43 | "async def main():\n", |
142 | | - " num_hosts = 2\n", |
143 | | - " appdef = await get_appdef(num_hosts)\n", |
144 | | - " server_info = await get_server_info(appdef)\n", |
| 44 | + " num_nodes = 2\n", |
| 45 | + " gpus_per_node = 8\n", |
| 46 | + " mesh_name = \"mesh0\"\n", |
| 47 | + " master_port = 29500\n", |
| 48 | + " \n", |
| 49 | + " # Create SLURM job\n", |
| 50 | + " slurm_job = SlurmJob(\n", |
| 51 | + " meshes={mesh_name: num_nodes},\n", |
| 52 | + " job_name=\"monarch_example\",\n", |
| 53 | + " gpus_per_node=gpus_per_node,\n", |
| 54 | + " time_limit=\"06:00:00\",\n", |
| 55 | + " )\n", |
145 | 56 | "\n", |
146 | 57 | " try:\n", |
147 | | - " proc_mesh = await create_proc_mesh(num_hosts, appdef, server_info)\n", |
148 | | - " actor = await proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n", |
| 58 | + " # Get job state and create process mesh\n", |
| 59 | + " job_state = slurm_job.state()\n", |
| 60 | + " proc_mesh = job_state.mesh0.spawn_procs({\"gpus\": gpus_per_node})\n", |
| 61 | + " \n", |
| 62 | + " # Get master_addr from rank 0\n", |
| 63 | + " hostname_actor = proc_mesh.spawn(\"hostname_actor\", _HostnameActor)\n", |
| 64 | + " hostname_values = await hostname_actor.flatten(\"rank\").slice(rank=0).get_hostname.call()\n", |
| 65 | + " master_addr = hostname_values.item()\n", |
| 66 | + " \n", |
| 67 | + " # Spawn actor\n", |
| 68 | + " actor = proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n", |
149 | 69 | "\n", |
150 | 70 | " logger.info(\"computing world size...\")\n", |
151 | 71 | " # this is redundant but is here for example sake\n", |
152 | | - " mesh_name = server_info.get_mesh_spec(\"mesh0\").name\n", |
153 | 72 | " values = await actor.compute_world_size.call(\n", |
154 | | - " master_addr=server_info.host0(mesh_name),\n", |
155 | | - " master_port=29500,\n", |
| 73 | + " master_addr=master_addr,\n", |
| 74 | + " master_port=master_port,\n", |
156 | 75 | " )\n", |
157 | 76 | "\n", |
158 | 77 | " values_by_rank = {f\"rank_{p.rank}\": v for p, v in list(values.flatten(\"rank\"))}\n", |
|
164 | 83 | " {'-'*40}\"\"\"\n", |
165 | 84 | " )\n", |
166 | 85 | " finally:\n", |
167 | | - " commands.kill(f\"slurm:///{server_info.name}\")\n", |
| 86 | + " # Cancel the SLURM job, releasing all reserved nodes back to the cluster\n", |
| 87 | + " slurm_job.kill()\n", |
| 88 | + " logger.info(\"Job terminated successfully\")\n", |
168 | 89 | "\n", |
169 | 90 | "\n", |
170 | 91 | "if __name__ == \"__main__\":\n", |
171 | 92 | " cloudpickle.register_pickle_by_value(sys.modules[ComputeWorldSizeActor.__module__])\n", |
172 | 93 | "\n", |
173 | 94 | " await main()" |
174 | 95 | ] |
| 96 | + }, |
| 97 | + { |
| 98 | + "cell_type": "code", |
| 99 | + "execution_count": null, |
| 100 | + "id": "71ca61d9", |
| 101 | + "metadata": {}, |
| 102 | + "outputs": [], |
| 103 | + "source": [] |
175 | 104 | } |
176 | 105 | ], |
177 | 106 | "metadata": { |
|
0 commit comments