Skip to content

Commit 199aff7

Browse files
mresometa-codesync[bot]
authored and committed
Update slurm examples to v1 (#1901)
Summary: This PR updates the slurm examples to use v1 api (SlurmJob + jobTrait) Testplan: Tested on GB200 cluster, see notebook output cc chriscai-amd Pull Request resolved: #1901 Reviewed By: dulinriley Differential Revision: D87157070 Pulled By: amirafzali fbshipit-source-id: 228ab64808604f74b554b6a7a8f50535bcb23964
1 parent 068c505 commit 199aff7

File tree

5 files changed

+340
-733
lines changed

5 files changed

+340
-733
lines changed

examples/slurm/__init__.py

Whitespace-only changes.

examples/slurm/utils.py

Lines changed: 0 additions & 92 deletions
This file was deleted.

examples/slurm_allreduce.ipynb

Lines changed: 47 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -5,126 +5,21 @@
55
"execution_count": null,
66
"id": "c443b989-5a71-455f-9a59-9963338634ec",
77
"metadata": {},
8-
"outputs": [
9-
{
10-
"name": "stderr",
11-
"output_type": "stream",
12-
"text": [
13-
"<string>:8: FutureWarning: Setting `workspace='/home/ubuntu/ahmads/monarch/examples'` is deprecated.\n",
14-
"torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `squeue` (squeue: error: Invalid job id: monarch-ubuntu\n",
15-
"), trying `sacct`\n",
16-
"torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `sacct` (sacct: fatal: Bad job/step specified: monarch-ubuntu\n",
17-
")\n",
18-
"monarch.tools.commands 2025-08-29 21:02:20 INFO no existing RUNNING server `slurm:///monarch-ubuntu` creating new one...\n",
19-
"torchx.runner.api 2025-08-29 21:02:20 INFO Tracker configurations: {}\n",
20-
"torchx.runner.api 2025-08-29 21:02:20 INFO Checking for changes in workspace `/home/ubuntu/.monarch/out/tmpkk97qppi/workspace`...\n",
21-
"torchx.runner.api 2025-08-29 21:02:20 INFO To disable workspaces pass: --workspace=\"\" from CLI or workspace=None programmatically.\n",
22-
"torchx.runner.api 2025-08-29 21:02:20 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.\n",
23-
"monarch.tools.commands 2025-08-29 21:02:20 INFO created new `slurm:///418` waiting for it to be ready...\n"
24-
]
25-
},
26-
{
27-
"name": "stdout",
28-
"output_type": "stream",
29-
"text": [
30-
"Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n",
31-
"Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n",
32-
"Waiting for slurm:///418 to be RUNNING (current: PENDING); will check again in 5.0 seconds. Total wait time: 0:00:20.105986\r"
33-
]
34-
},
35-
{
36-
"name": "stderr",
37-
"output_type": "stream",
38-
"text": [
39-
"slurm.utils 2025-08-29 21:02:45 INFO \n",
40-
"===== Server Info =====\n",
41-
"{\n",
42-
" \"name\": \"418\",\n",
43-
" \"server_handle\": \"slurm:///418\",\n",
44-
" \"state\": \"RUNNING\",\n",
45-
" \"meshes\": {\n",
46-
" \"mesh0\": {\n",
47-
" \"host_type\": \"__UNSET__\",\n",
48-
" \"hosts\": 2,\n",
49-
" \"gpus\": -1,\n",
50-
" \"hostnames\": [\n",
51-
" \"gpu-queue-st-gpu-compute-1\",\n",
52-
" \"gpu-queue-st-gpu-compute-2\"\n",
53-
" ]\n",
54-
" }\n",
55-
" }\n",
56-
"}\n",
57-
"__main__ 2025-08-29 21:02:45 INFO computing world size...\n",
58-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n",
59-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n",
60-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n",
61-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n",
62-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n",
63-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n",
64-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n",
65-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n",
66-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n",
67-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n",
68-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n",
69-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n",
70-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n",
71-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n"
72-
]
73-
},
74-
{
75-
"name": "stdout",
76-
"output_type": "stream",
77-
"text": [
78-
"New job `slurm:///418` is ready to serve.\n"
79-
]
80-
},
81-
{
82-
"name": "stderr",
83-
"output_type": "stream",
84-
"text": [
85-
"__main__ 2025-08-29 21:02:53 INFO computed world_sizes:\n",
86-
" ----------------------------------------\n",
87-
" {\n",
88-
" \"rank_0\": 8,\n",
89-
" \"rank_1\": 8,\n",
90-
" \"rank_2\": 8,\n",
91-
" \"rank_3\": 8,\n",
92-
" \"rank_4\": 8,\n",
93-
" \"rank_5\": 8,\n",
94-
" \"rank_6\": 8,\n",
95-
" \"rank_7\": 8\n",
96-
"}\n",
97-
" ----------------------------------------\n"
98-
]
99-
},
100-
{
101-
"name": "stdout",
102-
"output_type": "stream",
103-
"text": [
104-
"\u001b[36m>>> Aggregated Logs (2025-08-29 21:02:51) >>>\u001b[0m\n",
105-
"\u001b[33m[8 similar log lines]\u001b[0m Initializing process group `nccl`:\n",
106-
"\u001b[33m[8 similar log lines]\u001b[0m MASTER_ADDR = gpu-queue-st-gpu-compute-1\n",
107-
"\u001b[33m[8 similar log lines]\u001b[0m MASTER_PORT = 29500\n",
108-
"\u001b[33m[8 similar log lines]\u001b[0m RANK = 5\n",
109-
"\u001b[33m[8 similar log lines]\u001b[0m WORLD_SIZE = 8\n",
110-
"\u001b[36m<<< Aggregated Logs (2025-08-29 21:02:54) <<<\u001b[0m\n",
111-
"\n"
112-
]
113-
}
114-
],
8+
"outputs": [],
1159
"source": [
11610
"# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.\n",
11711
"\n",
11812
"# @noautodeps\n",
11913
"# pyre-ignore-all-errors\n",
12014
"import json\n",
12115
"import logging\n",
16+
"import socket\n",
12217
"import sys\n",
12318
"\n",
12419
"import cloudpickle\n",
125-
"from monarch.tools import commands\n",
12620
"from example_actors.compute_world_size_actor import ComputeWorldSizeActor\n",
127-
"from slurm.utils import get_appdef, get_server_info, create_proc_mesh\n",
21+
"from monarch.actor import Actor, endpoint\n",
22+
"from monarch.job import SlurmJob\n",
12823
"\n",
12924
"\n",
13025
"logging.basicConfig(\n",
@@ -138,21 +33,45 @@
13833
"logger: logging.Logger = logging.getLogger(__name__)\n",
13934
"\n",
14035
"\n",
36+
"class _HostnameActor(Actor):\n",
37+
" \"\"\"Helper actor to get hostname from rank 0\"\"\"\n",
38+
" @endpoint\n",
39+
" def get_hostname(self) -> str:\n",
40+
" return socket.gethostname()\n",
41+
"\n",
42+
"\n",
14143
"async def main():\n",
142-
" num_hosts = 2\n",
143-
" appdef = await get_appdef(num_hosts)\n",
144-
" server_info = await get_server_info(appdef)\n",
44+
" num_nodes = 2\n",
45+
" gpus_per_node = 8\n",
46+
" mesh_name = \"mesh0\"\n",
47+
" master_port = 29500\n",
48+
" \n",
49+
" # Create SLURM job\n",
50+
" slurm_job = SlurmJob(\n",
51+
" meshes={mesh_name: num_nodes},\n",
52+
" job_name=\"monarch_example\",\n",
53+
" gpus_per_node=gpus_per_node,\n",
54+
" time_limit=\"06:00:00\",\n",
55+
" )\n",
14556
"\n",
14657
" try:\n",
147-
" proc_mesh = await create_proc_mesh(num_hosts, appdef, server_info)\n",
148-
" actor = await proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n",
58+
" # Get job state and create process mesh\n",
59+
" job_state = slurm_job.state()\n",
60+
" proc_mesh = job_state.mesh0.spawn_procs({\"gpus\": gpus_per_node})\n",
61+
" \n",
62+
" # Get master_addr from rank 0\n",
63+
" hostname_actor = proc_mesh.spawn(\"hostname_actor\", _HostnameActor)\n",
64+
" hostname_values = await hostname_actor.flatten(\"rank\").slice(rank=0).get_hostname.call()\n",
65+
" master_addr = hostname_values.item()\n",
66+
" \n",
67+
" # Spawn actor\n",
68+
" actor = proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n",
14969
"\n",
15070
" logger.info(\"computing world size...\")\n",
15171
" # this is redundant but is here for example sake\n",
152-
" mesh_name = server_info.get_mesh_spec(\"mesh0\").name\n",
15372
" values = await actor.compute_world_size.call(\n",
154-
" master_addr=server_info.host0(mesh_name),\n",
155-
" master_port=29500,\n",
73+
" master_addr=master_addr,\n",
74+
" master_port=master_port,\n",
15675
" )\n",
15776
"\n",
15877
" values_by_rank = {f\"rank_{p.rank}\": v for p, v in list(values.flatten(\"rank\"))}\n",
@@ -164,14 +83,24 @@
16483
" {'-'*40}\"\"\"\n",
16584
" )\n",
16685
" finally:\n",
167-
" commands.kill(f\"slurm:///{server_info.name}\")\n",
86+
" # Cancel the SLURM job, releasing all reserved nodes back to the cluster\n",
87+
" slurm_job.kill()\n",
88+
" logger.info(\"Job terminated successfully\")\n",
16889
"\n",
16990
"\n",
17091
"if __name__ == \"__main__\":\n",
17192
" cloudpickle.register_pickle_by_value(sys.modules[ComputeWorldSizeActor.__module__])\n",
17293
"\n",
17394
" await main()"
17495
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": null,
100+
"id": "71ca61d9",
101+
"metadata": {},
102+
"outputs": [],
103+
"source": []
175104
}
176105
],
177106
"metadata": {

0 commit comments

Comments
 (0)