Skip to content

Commit 6f1e983

Browse files
committed
move runtime cluster config example to separate benchmark/ script
1 parent a61f9d2 commit 6f1e983

File tree

4 files changed

+169
-96
lines changed

4 files changed

+169
-96
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
pyproject.toml
2+
13
# Byte-compiled / optimized / DLL files
24
__pycache__/
35
*.py[cod]

benchmark/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,13 @@ The benchmark automatically configures RDMA settings for OCI's infrastructure:
3535
- Uses IPv4 for data plane (RDMA) communication
3636
- Configures optimal NCCL parameters for IB/RDMA
3737
- Sets appropriate HCA device ordering
## Runtime Cluster Configuration

This directory also contains an example of configuring the cluster definition at runtime in [modal_train_runtime_cfg.py](./modal_train_runtime_cfg.py). To run the NCCL bandwidth benchmark on a 2-node 8xB200 cluster:

```bash
python modal_train_runtime_cfg.py 2 8 --gpu-type B200
```

The `--gpu-type` parameter can be any of `H100`, `H200`, or `B200`.

benchmark/modal_train.py

Lines changed: 36 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
import argparse
2-
import dataclasses
3-
import enum
41
import os
5-
from typing import Union
62

73
import modal
84
import modal.experimental
@@ -39,95 +35,39 @@
3935
app = modal.App("multinode-benchmark")
4036

4137

42-
# NB: This cluster config code was ripped out of a project that shared training logic
43-
# across single and multi node execution configs, hence the validation in __post_init__
44-
class ModalGPU(enum.StrEnum):
45-
H100 = "H100"
46-
H200 = "H200"
47-
A100_40G = "A100-40G"
48-
A100_80G = "A100-80G"
49-
B200 = "B200"
50-
L40S = "L40S"
51-
52-
53-
@dataclasses.dataclass
54-
class ModalClusterConfig:
55-
num_nodes: int
56-
gpus_per_node: int
57-
gpu_type: Union[str, ModalGPU] = ModalGPU.H100
58-
59-
def __post_init__(self):
60-
if isinstance(self.gpu_type, str):
61-
try:
62-
self.gpu_type = ModalGPU(self.gpu_type)
63-
except ValueError:
64-
valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU])
65-
raise ValueError(
66-
f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}"
67-
)
68-
69-
# @modal.experimental.clustered only supports H100s at the moment
70-
if self.gpu_type != ModalGPU.H100 and self.num_nodes != 1:
71-
raise ValueError(
72-
f"num_nodes must be 1 when using gpu_type {self.gpu_type}. "
73-
f"At time of writing, only {ModalGPU.H100} supports multiple nodes."
74-
)
75-
76-
def gpu_str(self):
77-
return f"{self.gpu_type}:{self.gpus_per_node}"
78-
79-
80-
def build_benchmark(cfg: ModalClusterConfig):
81-
@app.function(
82-
gpu=cfg.gpu_str(),
83-
cloud="oci",
84-
image=image,
85-
serialized=True,
86-
)
87-
@modal.experimental.clustered(size=cfg.num_nodes, rdma=True)
88-
def run_benchmark():
89-
"""Run a simple benchmark script that passes around a tensor of size 500000x2000."""
90-
91-
from torch.distributed.run import parse_args, run
92-
93-
cluster_info = modal.experimental.get_cluster_info()
94-
# which container am I?
95-
container_rank: int = cluster_info.rank
96-
# what's the leader/master/main container's address?
97-
main_ip_addr: str = cluster_info.container_ips[0]
98-
container_id = os.environ["MODAL_TASK_ID"]
99-
100-
print(f"hello from {container_id}, rank {container_rank} of {N_NODES}")
101-
if container_rank == 0:
102-
print(f"main container's address: {main_ip_addr}")
103-
104-
args = [
105-
f"--nnodes={N_NODES}",
106-
f"--nproc-per-node={N_PROC_PER_NODE}",
107-
f"--node-rank={cluster_info.rank}",
108-
f"--master-addr={main_ip_addr}",
109-
REMOTE_BENCH_SCRIPT_PATH,
110-
]
111-
print(f"Running torchrun with args: {' '.join(args)}")
112-
run(parse_args(args))
113-
114-
return run_benchmark
115-
116-
117-
if __name__ == "__main__":
118-
parser = argparse.ArgumentParser(description="Run multinode benchmark")
119-
parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster")
120-
parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node")
121-
parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use")
122-
123-
args = parser.parse_args()
124-
125-
gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100")
126-
cluster_config = ModalClusterConfig(
127-
num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu
128-
)
129-
run_benchmark = build_benchmark(cluster_config)
130-
131-
with modal.enable_output():
132-
with app.run(detach=True):
133-
run_benchmark.remote()
38+
@app.function(
    gpu="H100:8",
    cloud="oci",
    image=image,
)
@modal.experimental.clustered(size=N_NODES, rdma=True)
def run_benchmark():
    """Run a simple benchmark script that passes around a tensor of size 500000x2000."""

    from torch.distributed.run import parse_args, run

    # Cluster topology is provided by Modal at runtime (rank + peer IPs).
    cluster_info = modal.experimental.get_cluster_info()
    # which container am I?
    container_rank: int = cluster_info.rank
    # what's the leader/master/main container's address?
    main_ip_addr: str = cluster_info.container_ips[0]
    container_id = os.environ["MODAL_TASK_ID"]

    print(f"hello from {container_id}, rank {container_rank} of {N_NODES}")
    if container_rank == 0:
        print(f"main container's address: {main_ip_addr}")

    # torchrun launch arguments: N_PROC_PER_NODE processes per node, with the
    # rank-0 container's address used as the rendezvous master for all nodes.
    args = [
        f"--nnodes={N_NODES}",
        f"--nproc-per-node={N_PROC_PER_NODE}",
        f"--node-rank={cluster_info.rank}",
        f"--master-addr={main_ip_addr}",
        REMOTE_BENCH_SCRIPT_PATH,
    ]
    print(f"Running torchrun with args: {' '.join(args)}")
    run(parse_args(args))


@app.local_entrypoint()
def main():
    # Kick off the clustered benchmark remotely; Modal fans this single call
    # out to all N_NODES containers in the cluster.
    run_benchmark.remote()
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import argparse
import dataclasses
import enum
import os
from typing import Union

import modal
import modal.experimental

# CUDA base-image selection for the benchmark container.
cuda_version = "12.9.1"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu24.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

# Local benchmark code is mounted into the container at /root/; the torchrun
# target script is expected at /root/train.py inside the image.
LOCAL_CODE_DIR = os.path.dirname(os.path.abspath(__file__))
REMOTE_CODE_DIR = "/root/"
REMOTE_BENCH_SCRIPT_PATH = "/root/train.py"

# Default cluster shape (overridable via the CLI in __main__).
N_NODES = 2
N_PROC_PER_NODE = 8

image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
    # ibverbs userspace libraries — presumably required for RDMA transport;
    # confirm against the RDMA notes in this directory's README.
    .apt_install(
        "libibverbs-dev",
        "libibverbs1",
    )
    .uv_pip_install(
        "torch==2.9.1", "numpy", "importlib-metadata", "nvidia-cudnn-cu12>=9.0.15"
    )
    .add_local_dir(
        LOCAL_CODE_DIR,
        remote_path=REMOTE_CODE_DIR,
    )
)

app = modal.App("multinode-benchmark")
38+
39+
40+
class ModalGPU(enum.StrEnum):
    """GPU types this script accepts for runtime cluster configuration.

    StrEnum so members format/compare as their plain string values
    (e.g. f"{ModalGPU.H100}" == "H100"), which gpu_str() relies on.
    """

    H100 = "H100"
    H200 = "H200"
    B200 = "B200"
44+
45+
46+
@dataclasses.dataclass
class ModalClusterConfig:
    """Describes the cluster to launch: node count, GPUs per node, GPU type.

    ``gpu_type`` may be supplied as a plain string (e.g. "B200"); it is
    coerced to a ``ModalGPU`` member in ``__post_init__``, raising
    ``ValueError`` for unrecognized values.
    """

    num_nodes: int
    gpus_per_node: int
    gpu_type: Union[str, ModalGPU] = ModalGPU.H100

    def __post_init__(self):
        # Guard clause: already a ModalGPU member — nothing to coerce.
        if not isinstance(self.gpu_type, str):
            return
        try:
            self.gpu_type = ModalGPU(self.gpu_type)
        except ValueError:
            options = ", ".join(f"'{member.value}'" for member in ModalGPU)
            raise ValueError(
                f"Invalid GPU type '{self.gpu_type}'. Must be one of: {options}"
            )

    def gpu_str(self):
        """Render the Modal GPU spec string, e.g. "H100:8"."""
        spec_parts = (str(self.gpu_type), str(self.gpus_per_node))
        return ":".join(spec_parts)
64+
65+
66+
def run_benchmark():
    """Run a simple benchmark script that passes around a tensor of size 500000x2000."""

    from torch.distributed.run import parse_args, run

    # Cluster topology is provided by Modal at runtime.
    info = modal.experimental.get_cluster_info()
    # which container am I?
    rank: int = info.rank
    # what's the leader/master/main container's address?
    leader_addr: str = info.container_ips[0]
    task_id = os.environ["MODAL_TASK_ID"]

    print(f"hello from {task_id}, rank {rank} of {N_NODES}")
    if rank == 0:
        print(f"main container's address: {leader_addr}")

    # torchrun launch arguments: one worker process per GPU on each node,
    # with the rank-0 container acting as the rendezvous master.
    torchrun_argv = [
        f"--nnodes={N_NODES}",
        f"--nproc-per-node={N_PROC_PER_NODE}",
        f"--node-rank={info.rank}",
        f"--master-addr={leader_addr}",
        REMOTE_BENCH_SCRIPT_PATH,
    ]
    print(f"Running torchrun with args: {' '.join(torchrun_argv)}")
    run(parse_args(torchrun_argv))
91+
92+
93+
def build_benchmark(cfg: ModalClusterConfig):
    """Wrap ``run_benchmark`` as a clustered Modal Function for ``cfg``.

    Applies the decorators as plain function calls (instead of ``@`` syntax)
    so the cluster size and GPU spec can be chosen at runtime from the CLI.
    """
    # additionally, could assign a different image build for hopper vs. blackwell
    # or perform other hardware-specific setup/configuration as needed

    clustered_fn = modal.experimental.clustered(size=cfg.num_nodes, rdma=True)(
        run_benchmark
    )
    register = app.function(
        gpu=cfg.gpu_str(),
        image=image,
    )
    return register(clustered_fn)
103+
104+
105+
if __name__ == "__main__":
    # CLI: positional node/GPU counts, plus an optional GPU type (defaults to H100).
    cli = argparse.ArgumentParser(description="Run multinode benchmark")
    cli.add_argument("num_nodes", type=int, help="Number of nodes in the cluster")
    cli.add_argument("gpus_per_node", type=int, help="Number of GPUs per node")
    cli.add_argument("--gpu-type", type=str, default=None, help="GPU type to use")
    ns = cli.parse_args()

    gpu = ModalGPU("H100") if ns.gpu_type is None else ModalGPU(ns.gpu_type)
    cluster_config = ModalClusterConfig(
        num_nodes=ns.num_nodes,
        gpus_per_node=ns.gpus_per_node,
        gpu_type=gpu,
    )
    run_benchmark = build_benchmark(cluster_config)

    # Stream build/run output locally; detach so the remote benchmark keeps
    # running even if this client disconnects.
    with modal.enable_output(), app.run(detach=True):
        run_benchmark.remote()

0 commit comments

Comments
 (0)