Skip to content

Commit 199aff7

Browse files
mresometa-codesync[bot]
authored and committed
Update slurm examples to v1 (#1901)
Summary: This PR updates the slurm examples to use v1 api (SlurmJob + jobTrait) Testplan: Tested on GB200 cluster, see notebook output cc chriscai-amd Pull Request resolved: #1901 Reviewed By: dulinriley Differential Revision: D87157070 Pulled By: amirafzali fbshipit-source-id: 228ab64808604f74b554b6a7a8f50535bcb23964
1 parent 068c505 commit 199aff7

File tree

5 files changed

+340
-733
lines changed

5 files changed

+340
-733
lines changed

examples/slurm/__init__.py

Whitespace-only changes.

examples/slurm/utils.py

Lines changed: 0 additions & 92 deletions
This file was deleted.

examples/slurm_allreduce.ipynb

Lines changed: 47 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -5,126 +5,21 @@
55
"execution_count": null,
66
"id": "c443b989-5a71-455f-9a59-9963338634ec",
77
"metadata": {},
8-
"outputs": [
9-
{
10-
"name": "stderr",
11-
"output_type": "stream",
12-
"text": [
13-
"<string>:8: FutureWarning: Setting `workspace='/home/ubuntu/ahmads/monarch/examples'` is deprecated.\n",
14-
"torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `squeue` (squeue: error: Invalid job id: monarch-ubuntu\n",
15-
"), trying `sacct`\n",
16-
"torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `sacct` (sacct: fatal: Bad job/step specified: monarch-ubuntu\n",
17-
")\n",
18-
"monarch.tools.commands 2025-08-29 21:02:20 INFO no existing RUNNING server `slurm:///monarch-ubuntu` creating new one...\n",
19-
"torchx.runner.api 2025-08-29 21:02:20 INFO Tracker configurations: {}\n",
20-
"torchx.runner.api 2025-08-29 21:02:20 INFO Checking for changes in workspace `/home/ubuntu/.monarch/out/tmpkk97qppi/workspace`...\n",
21-
"torchx.runner.api 2025-08-29 21:02:20 INFO To disable workspaces pass: --workspace=\"\" from CLI or workspace=None programmatically.\n",
22-
"torchx.runner.api 2025-08-29 21:02:20 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.\n",
23-
"monarch.tools.commands 2025-08-29 21:02:20 INFO created new `slurm:///418` waiting for it to be ready...\n"
24-
]
25-
},
26-
{
27-
"name": "stdout",
28-
"output_type": "stream",
29-
"text": [
30-
"Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n",
31-
"Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n",
32-
"Waiting for slurm:///418 to be RUNNING (current: PENDING); will check again in 5.0 seconds. Total wait time: 0:00:20.105986\r"
33-
]
34-
},
35-
{
36-
"name": "stderr",
37-
"output_type": "stream",
38-
"text": [
39-
"slurm.utils 2025-08-29 21:02:45 INFO \n",
40-
"===== Server Info =====\n",
41-
"{\n",
42-
" \"name\": \"418\",\n",
43-
" \"server_handle\": \"slurm:///418\",\n",
44-
" \"state\": \"RUNNING\",\n",
45-
" \"meshes\": {\n",
46-
" \"mesh0\": {\n",
47-
" \"host_type\": \"__UNSET__\",\n",
48-
" \"hosts\": 2,\n",
49-
" \"gpus\": -1,\n",
50-
" \"hostnames\": [\n",
51-
" \"gpu-queue-st-gpu-compute-1\",\n",
52-
" \"gpu-queue-st-gpu-compute-2\"\n",
53-
" ]\n",
54-
" }\n",
55-
" }\n",
56-
"}\n",
57-
"__main__ 2025-08-29 21:02:45 INFO computing world size...\n",
58-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n",
59-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n",
60-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n",
61-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n",
62-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n",
63-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n",
64-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n",
65-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n",
66-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n",
67-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n",
68-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n",
69-
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n",
70-
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n",
71-
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n"
72-
]
73-
},
74-
{
75-
"name": "stdout",
76-
"output_type": "stream",
77-
"text": [
78-
"New job `slurm:///418` is ready to serve.\n"
79-
]
80-
},
81-
{
82-
"name": "stderr",
83-
"output_type": "stream",
84-
"text": [
85-
"__main__ 2025-08-29 21:02:53 INFO computed world_sizes:\n",
86-
" ----------------------------------------\n",
87-
" {\n",
88-
" \"rank_0\": 8,\n",
89-
" \"rank_1\": 8,\n",
90-
" \"rank_2\": 8,\n",
91-
" \"rank_3\": 8,\n",
92-
" \"rank_4\": 8,\n",
93-
" \"rank_5\": 8,\n",
94-
" \"rank_6\": 8,\n",
95-
" \"rank_7\": 8\n",
96-
"}\n",
97-
" ----------------------------------------\n"
98-
]
99-
},
100-
{
101-
"name": "stdout",
102-
"output_type": "stream",
103-
"text": [
104-
"\u001b[36m>>> Aggregated Logs (2025-08-29 21:02:51) >>>\u001b[0m\n",
105-
"\u001b[33m[8 similar log lines]\u001b[0m Initializing process group `nccl`:\n",
106-
"\u001b[33m[8 similar log lines]\u001b[0m MASTER_ADDR = gpu-queue-st-gpu-compute-1\n",
107-
"\u001b[33m[8 similar log lines]\u001b[0m MASTER_PORT = 29500\n",
108-
"\u001b[33m[8 similar log lines]\u001b[0m RANK = 5\n",
109-
"\u001b[33m[8 similar log lines]\u001b[0m WORLD_SIZE = 8\n",
110-
"\u001b[36m<<< Aggregated Logs (2025-08-29 21:02:54) <<<\u001b[0m\n",
111-
"\n"
112-
]
113-
}
114-
],
8+
"outputs": [],
1159
"source": [
11610
"# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.\n",
11711
"\n",
11812
"# @noautodeps\n",
11913
"# pyre-ignore-all-errors\n",
12014
"import json\n",
12115
"import logging\n",
16+
"import socket\n",
12217
"import sys\n",
12318
"\n",
12419
"import cloudpickle\n",
125-
"from monarch.tools import commands\n",
12620
"from example_actors.compute_world_size_actor import ComputeWorldSizeActor\n",
127-
"from slurm.utils import get_appdef, get_server_info, create_proc_mesh\n",
21+
"from monarch.actor import Actor, endpoint\n",
22+
"from monarch.job import SlurmJob\n",
12823
"\n",
12924
"\n",
13025
"logging.basicConfig(\n",
@@ -138,21 +33,45 @@
13833
"logger: logging.Logger = logging.getLogger(__name__)\n",
13934
"\n",
14035
"\n",
36+
"class _HostnameActor(Actor):\n",
37+
" \"\"\"Helper actor to get hostname from rank 0\"\"\"\n",
38+
" @endpoint\n",
39+
" def get_hostname(self) -> str:\n",
40+
" return socket.gethostname()\n",
41+
"\n",
42+
"\n",
14143
"async def main():\n",
142-
" num_hosts = 2\n",
143-
" appdef = await get_appdef(num_hosts)\n",
144-
" server_info = await get_server_info(appdef)\n",
44+
" num_nodes = 2\n",
45+
" gpus_per_node = 8\n",
46+
" mesh_name = \"mesh0\"\n",
47+
" master_port = 29500\n",
48+
" \n",
49+
" # Create SLURM job\n",
50+
" slurm_job = SlurmJob(\n",
51+
" meshes={mesh_name: num_nodes},\n",
52+
" job_name=\"monarch_example\",\n",
53+
" gpus_per_node=gpus_per_node,\n",
54+
" time_limit=\"06:00:00\",\n",
55+
" )\n",
14556
"\n",
14657
" try:\n",
147-
" proc_mesh = await create_proc_mesh(num_hosts, appdef, server_info)\n",
148-
" actor = await proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n",
58+
" # Get job state and create process mesh\n",
59+
" job_state = slurm_job.state()\n",
60+
" proc_mesh = job_state.mesh0.spawn_procs({\"gpus\": gpus_per_node})\n",
61+
" \n",
62+
" # Get master_addr from rank 0\n",
63+
" hostname_actor = proc_mesh.spawn(\"hostname_actor\", _HostnameActor)\n",
64+
" hostname_values = await hostname_actor.flatten(\"rank\").slice(rank=0).get_hostname.call()\n",
65+
" master_addr = hostname_values.item()\n",
66+
" \n",
67+
" # Spawn actor\n",
68+
" actor = proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n",
14969
"\n",
15070
" logger.info(\"computing world size...\")\n",
15171
" # this is redundant but is here for example sake\n",
152-
" mesh_name = server_info.get_mesh_spec(\"mesh0\").name\n",
15372
" values = await actor.compute_world_size.call(\n",
154-
" master_addr=server_info.host0(mesh_name),\n",
155-
" master_port=29500,\n",
73+
" master_addr=master_addr,\n",
74+
" master_port=master_port,\n",
15675
" )\n",
15776
"\n",
15877
" values_by_rank = {f\"rank_{p.rank}\": v for p, v in list(values.flatten(\"rank\"))}\n",
@@ -164,14 +83,24 @@
16483
" {'-'*40}\"\"\"\n",
16584
" )\n",
16685
" finally:\n",
167-
" commands.kill(f\"slurm:///{server_info.name}\")\n",
86+
" # Cancel the SLURM job, releasing all reserved nodes back to the cluster\n",
87+
" slurm_job.kill()\n",
88+
" logger.info(\"Job terminated successfully\")\n",
16889
"\n",
16990
"\n",
17091
"if __name__ == \"__main__\":\n",
17192
" cloudpickle.register_pickle_by_value(sys.modules[ComputeWorldSizeActor.__module__])\n",
17293
"\n",
17394
" await main()"
17495
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": null,
100+
"id": "71ca61d9",
101+
"metadata": {},
102+
"outputs": [],
103+
"source": []
175104
}
176105
],
177106
"metadata": {

0 commit comments

Comments
 (0)