Skip to content

Commit 6f1e983

Browse files
committed
move runtime cluster config example to separate benchmark/ script
1 parent a61f9d2 commit 6f1e983

File tree

4 files changed

+169
-96
lines changed

4 files changed

+169
-96
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
pyproject.toml
2+
13
# Byte-compiled / optimized / DLL files
24
__pycache__/
35
*.py[cod]

benchmark/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,13 @@ The benchmark automatically configures RDMA settings for OCI's infrastructure:
3535
- Uses IPv4 for data plane (RDMA) communication
3636
- Configures optimal NCCL parameters for IB/RDMA
3737
- Sets appropriate HCA device ordering
## Runtime Cluster Configuration

This directory also contains an example of configuring the cluster definition at runtime in [modal_train_runtime_cfg.py](./modal_train_runtime_cfg.py). To run the NCCL bandwidth benchmark on a 2-node 8xB200 cluster:

```bash
python modal_train_runtime_cfg.py 2 8 --gpu-type B200
```

The `--gpu-type` parameter can be any of `H100`, `H200`, or `B200`.

benchmark/modal_train.py

Lines changed: 36 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
import argparse
2-
import dataclasses
3-
import enum
41
import os
5-
from typing import Union
62

73
import modal
84
import modal.experimental
@@ -39,95 +35,39 @@
3935
app = modal.App("multinode-benchmark")
4036

4137

42-
# NB: This cluster config code was ripped out of a project that shared training logic
43-
# across single and multi node execution configs, hence the validation in __post_init__
44-
class ModalGPU(enum.StrEnum):
45-
H100 = "H100"
46-
H200 = "H200"
47-
A100_40G = "A100-40G"
48-
A100_80G = "A100-80G"
49-
B200 = "B200"
50-
L40S = "L40S"
51-
52-
53-
@dataclasses.dataclass
54-
class ModalClusterConfig:
55-
num_nodes: int
56-
gpus_per_node: int
57-
gpu_type: Union[str, ModalGPU] = ModalGPU.H100
58-
59-
def __post_init__(self):
60-
if isinstance(self.gpu_type, str):
61-
try:
62-
self.gpu_type = ModalGPU(self.gpu_type)
63-
except ValueError:
64-
valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU])
65-
raise ValueError(
66-
f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}"
67-
)
68-
69-
# @modal.experimental.clustered only supports H100s at the moment
70-
if self.gpu_type != ModalGPU.H100 and self.num_nodes != 1:
71-
raise ValueError(
72-
f"num_nodes must be 1 when using gpu_type {self.gpu_type}. "
73-
f"At time of writing, only {ModalGPU.H100} supports multiple nodes."
74-
)
75-
76-
def gpu_str(self):
77-
return f"{self.gpu_type}:{self.gpus_per_node}"
78-
79-
80-
def build_benchmark(cfg: ModalClusterConfig):
81-
@app.function(
82-
gpu=cfg.gpu_str(),
83-
cloud="oci",
84-
image=image,
85-
serialized=True,
86-
)
87-
@modal.experimental.clustered(size=cfg.num_nodes, rdma=True)
88-
def run_benchmark():
89-
"""Run a simple benchmark script that passes around a tensor of size 500000x2000."""
90-
91-
from torch.distributed.run import parse_args, run
92-
93-
cluster_info = modal.experimental.get_cluster_info()
94-
# which container am I?
95-
container_rank: int = cluster_info.rank
96-
# what's the leader/master/main container's address?
97-
main_ip_addr: str = cluster_info.container_ips[0]
98-
container_id = os.environ["MODAL_TASK_ID"]
99-
100-
print(f"hello from {container_id}, rank {container_rank} of {N_NODES}")
101-
if container_rank == 0:
102-
print(f"main container's address: {main_ip_addr}")
103-
104-
args = [
105-
f"--nnodes={N_NODES}",
106-
f"--nproc-per-node={N_PROC_PER_NODE}",
107-
f"--node-rank={cluster_info.rank}",
108-
f"--master-addr={main_ip_addr}",
109-
REMOTE_BENCH_SCRIPT_PATH,
110-
]
111-
print(f"Running torchrun with args: {' '.join(args)}")
112-
run(parse_args(args))
113-
114-
return run_benchmark
115-
116-
117-
if __name__ == "__main__":
118-
parser = argparse.ArgumentParser(description="Run multinode benchmark")
119-
parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster")
120-
parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node")
121-
parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use")
122-
123-
args = parser.parse_args()
124-
125-
gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100")
126-
cluster_config = ModalClusterConfig(
127-
num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu
128-
)
129-
run_benchmark = build_benchmark(cluster_config)
130-
131-
with modal.enable_output():
132-
with app.run(detach=True):
133-
run_benchmark.remote()
38+
@app.function(
    gpu="H100:8",
    cloud="oci",
    image=image,
)
@modal.experimental.clustered(size=N_NODES, rdma=True)
def run_benchmark():
    """Run a simple benchmark script that passes around a tensor of size 500000x2000."""

    from torch.distributed.run import parse_args, run

    # Cluster topology is provided by Modal at runtime (rank + peer IPs).
    cluster_info = modal.experimental.get_cluster_info()
    # which container am I?
    container_rank: int = cluster_info.rank
    # what's the leader/master/main container's address?
    main_ip_addr: str = cluster_info.container_ips[0]
    container_id = os.environ["MODAL_TASK_ID"]

    print(f"hello from {container_id}, rank {container_rank} of {N_NODES}")
    if container_rank == 0:
        print(f"main container's address: {main_ip_addr}")

    # torchrun launch arguments: N_PROC_PER_NODE processes per node, with the
    # rank-0 container's address used as the rendezvous master for all nodes.
    args = [
        f"--nnodes={N_NODES}",
        f"--nproc-per-node={N_PROC_PER_NODE}",
        f"--node-rank={cluster_info.rank}",
        f"--master-addr={main_ip_addr}",
        REMOTE_BENCH_SCRIPT_PATH,
    ]
    print(f"Running torchrun with args: {' '.join(args)}")
    run(parse_args(args))


@app.local_entrypoint()
def main():
    # Kick off the clustered benchmark remotely; Modal fans this single call
    # out to all N_NODES containers in the cluster.
    run_benchmark.remote()
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import argparse
import dataclasses
import enum
import os
from typing import Union

import modal
import modal.experimental

# CUDA base-image selection for the benchmark container.
cuda_version = "12.9.1"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu24.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

# Local benchmark code is mounted into the container at /root/; the torchrun
# target script is expected at /root/train.py inside the image.
LOCAL_CODE_DIR = os.path.dirname(os.path.abspath(__file__))
REMOTE_CODE_DIR = "/root/"
REMOTE_BENCH_SCRIPT_PATH = "/root/train.py"

# Default cluster shape (overridable via the CLI in __main__).
N_NODES = 2
N_PROC_PER_NODE = 8

image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
    # ibverbs userspace libraries — presumably required for RDMA transport;
    # confirm against the RDMA notes in this directory's README.
    .apt_install(
        "libibverbs-dev",
        "libibverbs1",
    )
    .uv_pip_install(
        "torch==2.9.1", "numpy", "importlib-metadata", "nvidia-cudnn-cu12>=9.0.15"
    )
    .add_local_dir(
        LOCAL_CODE_DIR,
        remote_path=REMOTE_CODE_DIR,
    )
)

app = modal.App("multinode-benchmark")
38+
39+
40+
class ModalGPU(enum.StrEnum):
    """GPU types this script accepts for runtime cluster configuration.

    StrEnum so members format/compare as their plain string values
    (e.g. f"{ModalGPU.H100}" == "H100"), which gpu_str() relies on.
    """

    H100 = "H100"
    H200 = "H200"
    B200 = "B200"
44+
45+
46+
@dataclasses.dataclass
class ModalClusterConfig:
    """Describes the cluster to launch: node count, GPUs per node, GPU type.

    ``gpu_type`` may be supplied as a plain string (e.g. "B200"); it is
    coerced to a ``ModalGPU`` member in ``__post_init__``, raising
    ``ValueError`` for unrecognized values.
    """

    num_nodes: int
    gpus_per_node: int
    gpu_type: Union[str, ModalGPU] = ModalGPU.H100

    def __post_init__(self):
        # Guard clause: already a ModalGPU member — nothing to coerce.
        if not isinstance(self.gpu_type, str):
            return
        try:
            self.gpu_type = ModalGPU(self.gpu_type)
        except ValueError:
            options = ", ".join(f"'{member.value}'" for member in ModalGPU)
            raise ValueError(
                f"Invalid GPU type '{self.gpu_type}'. Must be one of: {options}"
            )

    def gpu_str(self):
        """Render the Modal GPU spec string, e.g. "H100:8"."""
        spec_parts = (str(self.gpu_type), str(self.gpus_per_node))
        return ":".join(spec_parts)
64+
65+
66+
def run_benchmark():
    """Run a simple benchmark script that passes around a tensor of size 500000x2000."""

    from torch.distributed.run import parse_args, run

    # Cluster topology is provided by Modal at runtime.
    info = modal.experimental.get_cluster_info()
    # which container am I?
    rank: int = info.rank
    # what's the leader/master/main container's address?
    leader_addr: str = info.container_ips[0]
    task_id = os.environ["MODAL_TASK_ID"]

    print(f"hello from {task_id}, rank {rank} of {N_NODES}")
    if rank == 0:
        print(f"main container's address: {leader_addr}")

    # torchrun launch arguments: one worker process per GPU on each node,
    # with the rank-0 container acting as the rendezvous master.
    torchrun_argv = [
        f"--nnodes={N_NODES}",
        f"--nproc-per-node={N_PROC_PER_NODE}",
        f"--node-rank={info.rank}",
        f"--master-addr={leader_addr}",
        REMOTE_BENCH_SCRIPT_PATH,
    ]
    print(f"Running torchrun with args: {' '.join(torchrun_argv)}")
    run(parse_args(torchrun_argv))
91+
92+
93+
def build_benchmark(cfg: ModalClusterConfig):
    """Wrap ``run_benchmark`` as a clustered Modal Function for ``cfg``.

    Applies the decorators as plain function calls (instead of ``@`` syntax)
    so the cluster size and GPU spec can be chosen at runtime from the CLI.
    """
    # additionally, could assign a different image build for hopper vs. blackwell
    # or perform other hardware-specific setup/configuration as needed

    clustered_fn = modal.experimental.clustered(size=cfg.num_nodes, rdma=True)(
        run_benchmark
    )
    register = app.function(
        gpu=cfg.gpu_str(),
        image=image,
    )
    return register(clustered_fn)
103+
104+
105+
if __name__ == "__main__":
    # CLI: positional node/GPU counts, plus an optional GPU type (defaults to H100).
    cli = argparse.ArgumentParser(description="Run multinode benchmark")
    cli.add_argument("num_nodes", type=int, help="Number of nodes in the cluster")
    cli.add_argument("gpus_per_node", type=int, help="Number of GPUs per node")
    cli.add_argument("--gpu-type", type=str, default=None, help="GPU type to use")
    ns = cli.parse_args()

    gpu = ModalGPU("H100") if ns.gpu_type is None else ModalGPU(ns.gpu_type)
    cluster_config = ModalClusterConfig(
        num_nodes=ns.num_nodes,
        gpus_per_node=ns.gpus_per_node,
        gpu_type=gpu,
    )
    run_benchmark = build_benchmark(cluster_config)

    # Stream build/run output locally; detach so the remote benchmark keeps
    # running even if this client disconnects.
    with modal.enable_output(), app.run(detach=True):
        run_benchmark.remote()

0 commit comments

Comments
 (0)