Skip to content

Commit 2378171

Browse files
Create BM.GPU.H200.8.yaml
1 parent 9cd5df3 commit 2378171

File tree

1 file changed

+122
-0
lines changed

1 file changed

+122
-0
lines changed
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
---
# Volcano batch Job that runs the nccl-tests `all_reduce_perf` benchmark via MPI.
#
# Two tasks are defined:
#   * mpimaster (1 replica)  - runs `mpirun` against the hosts listed in
#     /etc/volcano/mpiworker.host.
#   * mpiworker (2 replicas) - each requests 8 GPUs and runs sshd on port 2222
#     so that `mpirun` can start ranks on it.
#
# To scale the test, change `replicas` of the mpiworker task; the master script
# derives the rank count from the number of lines in the host file.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  annotations: {}
  name: nccl-allreduce-job0
spec:
  minAvailable: 0
  # NOTE(review): the master script relies on /etc/volcano/mpiworker.host and
  # on SSH access to the workers; presumably these are provided by the `svc`
  # and `ssh` plugins - confirm against the Volcano plugin documentation.
  plugins:
    ssh: []
    svc: []
  queue: default
  schedulerName: volcano
  tasks:
    - name: mpimaster
      policies:
        - action: CompleteJob
          event: TaskCompleted
      replicas: 1
      template:
        metadata: {}
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                # Launch script (the block scalar below is passed verbatim to bash):
                #   * NUM_HOSTS is the line count of the worker host file and
                #     NP = NUM_HOSTS * NUM_GPUS, i.e. 8 processes per host.
                #   * `plm_rsh_args "-p 2222"` matches the port the workers'
                #     sshd listens on.
                #   * `NCCL_IB_HCA==mlx5_0,...`: the doubled '=' is intentional.
                #     The variable's value starts with '=', which NCCL treats
                #     as "match these device names exactly" (otherwise e.g.
                #     mlx5_1 would also match mlx5_10). Do not "fix" it.
                #   * After `mpirun` returns, the script loops until SIGINT
                #     sets `exit`, so the container stays up once the
                #     benchmark has finished.
                #     NOTE(review): this appears to delay the TaskCompleted /
                #     CompleteJob policy until the pod is signalled - confirm
                #     this is the intended behaviour.
                - |
                  set -e -o pipefail; trap 'exit=1' SIGINT
                  NUM_GPUS=8
                  NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host)
                  NP=$(($NUM_HOSTS*$NUM_GPUS))
                  mpirun --allow-run-as-root \
                    -mca coll ^hcoll -mca plm_rsh_args "-p 2222" \
                    -mca coll_hcoll_enable 0 \
                    -np $NP -npernode $NUM_GPUS --bind-to numa \
                    -hostfile /etc/volcano/mpiworker.host \
                    -x NCCL_DEBUG=WARN \
                    -x NCCL_CUMEM_ENABLE=0 \
                    -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
                    -x NCCL_IB_QPS_PER_CONNECTION=2 \
                    -x NCCL_IB_GID_INDEX=3 \
                    -x NCCL_IB_HCA==mlx5_0,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9,mlx5_10,mlx5_11 \
                    -x NCCL_IB_TC=41 \
                    -x NCCL_IB_SL=0 \
                    -x NCCL_IB_TIMEOUT=16 \
                    -x HCOLL_ENABLE_MCAST_ALL=0 \
                    -x UCX_TLS=tcp \
                    -x UCX_NET_DEVICES=eth0 \
                    -x RX_QUEUE_LEN=8192 \
                    -x IB_RX_QUEUE_LEN=8192 \
                    -x NCCL_SOCKET_IFNAME=eth0 \
                    -x NCCL_IGNORE_CPU_AFFINITY=1 \
                    /workspace/nccl-tests/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
                  while :; do { [[ $exit ]] && break; }; sleep 1; done
              ports:
                - name: mpijob-port
                  containerPort: 2222
                  protocol: TCP
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-25.06-nccl-2.27.7-1
              name: mpimaster
              # The master only orchestrates; it requests no GPUs.
              resources:
                limits:
                  ephemeral-storage: 16Gi
                requests:
                  cpu: 4
                  ephemeral-storage: 16Gi
                  memory: 1Gi
              securityContext:
                privileged: true
                capabilities:
                  add:
                    - IPC_LOCK
              volumeMounts:
                - mountPath: /dev/infiniband
                  name: devinf
                - mountPath: /dev/shm
                  name: shm
              workingDir: /workspace
          # Pods share the node's network namespace; ClusterFirstWithHostNet
          # keeps cluster DNS resolution working in that mode.
          dnsPolicy: ClusterFirstWithHostNet
          hostNetwork: true
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 2
          volumes:
            # Host RDMA device nodes.
            - name: devinf
              hostPath:
                path: /dev/infiniband
            # Memory-backed /dev/shm, capped at 32Gi.
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 32Gi
    - minAvailable: 0
      name: mpiworker
      replicas: 2
      template:
        metadata: {}
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                # Run sshd in the foreground on port 2222; if sshd exits with an
                # error, sleep instead so the container does not terminate.
                - 'mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999;'
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-25.06-nccl-2.27.7-1
              name: mpiworker
              ports:
                - name: mpijob-port
                  containerPort: 2222
                  protocol: TCP
              # One worker pod claims 8 GPUs, matching NUM_GPUS=8 /
              # `-npernode` in the master script.
              resources:
                limits:
                  ephemeral-storage: 32Gi
                  nvidia.com/gpu: 8
                requests:
                  cpu: 100
                  ephemeral-storage: 32Gi
                  memory: 512Gi
                  nvidia.com/gpu: 8
              securityContext:
                privileged: true
                capabilities:
                  add:
                    - IPC_LOCK
              volumeMounts:
                - mountPath: /dev/infiniband
                  name: devinf
                - mountPath: /dev/shm
                  name: shm
              workingDir: /workspace
          dnsPolicy: ClusterFirstWithHostNet
          hostNetwork: true
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 15
          # Allow scheduling onto nodes tainted with the nvidia.com/gpu key.
          tolerations:
            - key: nvidia.com/gpu
              operator: Exists
          volumes:
            - name: devinf
              hostPath:
                path: /dev/infiniband
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 32Gi

0 commit comments

Comments
 (0)