Skip to content

Commit a61f9d2

Browse files
committed
demonstrate runtime cluster config in benchmark/
1 parent 14beb9b commit a61f9d2

File tree

1 file changed

+96
-36
lines changed

1 file changed

+96
-36
lines changed

benchmark/modal_train.py

Lines changed: 96 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1+
import argparse
2+
import dataclasses
3+
import enum
14
import os
5+
from typing import Union
26

37
import modal
48
import modal.experimental
@@ -35,39 +39,95 @@
3539
app = modal.App("multinode-benchmark")
3640

3741

38-
@app.function(
39-
gpu="H100:8",
40-
cloud="oci",
41-
image=image,
42-
)
43-
@modal.experimental.clustered(size=N_NODES, rdma=True)
44-
def run_benchmark():
45-
"""Run a simple benchmark script that passes around a tensor of size 500000x2000."""
46-
47-
from torch.distributed.run import parse_args, run
48-
49-
cluster_info = modal.experimental.get_cluster_info()
50-
# which container am I?
51-
container_rank: int = cluster_info.rank
52-
# what's the leader/master/main container's address?
53-
main_ip_addr: str = cluster_info.container_ips[0]
54-
container_id = os.environ["MODAL_TASK_ID"]
55-
56-
print(f"hello from {container_id}, rank {container_rank} of {N_NODES}")
57-
if container_rank == 0:
58-
print(f"main container's address: {main_ip_addr}")
59-
60-
args = [
61-
f"--nnodes={N_NODES}",
62-
f"--nproc-per-node={N_PROC_PER_NODE}",
63-
f"--node-rank={cluster_info.rank}",
64-
f"--master-addr={main_ip_addr}",
65-
REMOTE_BENCH_SCRIPT_PATH,
66-
]
67-
print(f"Running torchrun with args: {' '.join(args)}")
68-
run(parse_args(args))
69-
70-
71-
@app.local_entrypoint()
72-
def main():
73-
run_benchmark.remote()
42+
# NB: This cluster config code was ripped out of a project that shared training logic
43+
# across single and multi node execution configs, hence the validation in __post_init__
44+
class ModalGPU(enum.StrEnum):
45+
H100 = "H100"
46+
H200 = "H200"
47+
A100_40G = "A100-40G"
48+
A100_80G = "A100-80G"
49+
B200 = "B200"
50+
L40S = "L40S"
51+
52+
53+
@dataclasses.dataclass
54+
class ModalClusterConfig:
55+
num_nodes: int
56+
gpus_per_node: int
57+
gpu_type: Union[str, ModalGPU] = ModalGPU.H100
58+
59+
def __post_init__(self):
60+
if isinstance(self.gpu_type, str):
61+
try:
62+
self.gpu_type = ModalGPU(self.gpu_type)
63+
except ValueError:
64+
valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU])
65+
raise ValueError(
66+
f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}"
67+
)
68+
69+
# @modal.experimental.clustered only supports H100s at the moment
70+
if self.gpu_type != ModalGPU.H100 and self.num_nodes != 1:
71+
raise ValueError(
72+
f"num_nodes must be 1 when using gpu_type {self.gpu_type}. "
73+
f"At time of writing, only {ModalGPU.H100} supports multiple nodes."
74+
)
75+
76+
def gpu_str(self):
77+
return f"{self.gpu_type}:{self.gpus_per_node}"
78+
79+
80+
def build_benchmark(cfg: ModalClusterConfig):
    """Build and register a Modal function that runs the torchrun benchmark
    on the cluster described by *cfg*.

    Returns the decorated Modal function so the caller can invoke
    ``.remote()`` on it inside an ``app.run()`` context.
    """

    @app.function(
        gpu=cfg.gpu_str(),
        cloud="oci",
        image=image,
        serialized=True,
    )
    @modal.experimental.clustered(size=cfg.num_nodes, rdma=True)
    def run_benchmark():
        """Run a simple benchmark script that passes around a tensor of size 500000x2000."""

        from torch.distributed.run import parse_args, run

        cluster_info = modal.experimental.get_cluster_info()
        # which container am I?
        container_rank: int = cluster_info.rank
        # what's the leader/master/main container's address?
        main_ip_addr: str = cluster_info.container_ips[0]
        container_id = os.environ["MODAL_TASK_ID"]

        # NOTE: use cfg, not the module-level N_NODES / N_PROC_PER_NODE
        # constants — the whole point of this builder is that the cluster
        # shape comes from the runtime config, and the cluster was sized
        # with cfg.num_nodes above. Using the globals here would launch
        # torchrun with a mismatched --nnodes/--nproc-per-node.
        print(f"hello from {container_id}, rank {container_rank} of {cfg.num_nodes}")
        if container_rank == 0:
            print(f"main container's address: {main_ip_addr}")

        args = [
            f"--nnodes={cfg.num_nodes}",
            f"--nproc-per-node={cfg.gpus_per_node}",
            f"--node-rank={cluster_info.rank}",
            f"--master-addr={main_ip_addr}",
            REMOTE_BENCH_SCRIPT_PATH,
        ]
        print(f"Running torchrun with args: {' '.join(args)}")
        run(parse_args(args))

    return run_benchmark
115+
116+
117+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run multinode benchmark")
    parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster")
    parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node")
    # Let argparse surface the valid values in --help and reject bad ones
    # up front; default matches the previous hard-coded "H100" fallback.
    parser.add_argument(
        "--gpu-type",
        type=str,
        default=ModalGPU.H100.value,
        choices=[g.value for g in ModalGPU],
        help="GPU type to use",
    )

    args = parser.parse_args()

    # Pass the raw string through: ModalClusterConfig.__post_init__ coerces it
    # to a ModalGPU and validates the combination (multi-node requires H100),
    # so there is no need to duplicate that conversion here.
    cluster_config = ModalClusterConfig(
        num_nodes=args.num_nodes,
        gpus_per_node=args.gpus_per_node,
        gpu_type=args.gpu_type,
    )
    run_benchmark = build_benchmark(cluster_config)

    # detach=True keeps the app (and the benchmark) running even if this
    # local process disconnects.
    with modal.enable_output():
        with app.run(detach=True):
            run_benchmark.remote()

0 commit comments

Comments
 (0)