Create BM.GPU.H200.8.yaml

OguzPastirmaci · web-flow · commit 237817113a6d · 2025-08-06T22:54:36.000-07:00
diff --git a/manifests/nccl-tests/BM.GPU.H200.8.yaml b/manifests/nccl-tests/BM.GPU.H200.8.yaml
@@ -0,0 +1,122 @@
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  annotations:
+  name: nccl-allreduce-job0
+spec:
+  minAvailable: 0
+  plugins:
+    ssh: []
+    svc: []
+  queue: default
+  schedulerName: volcano
+  tasks:
+  - name: mpimaster
+    policies:
+    - action: CompleteJob
+      event: TaskCompleted
+    replicas: 1
+    template:
+      metadata:
+      spec:
+        containers:
+        - command:
+          - /bin/bash
+          - -c
+          - |
+            set -e -o pipefail; trap 'exit=1' SIGINT
+            NUM_GPUS=8
+            NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host)
+            NP=$(($NUM_HOSTS*$NUM_GPUS))
+            mpirun --allow-run-as-root \
+              -mca coll ^hcoll  -mca plm_rsh_args "-p 2222" \
+              -mca coll_hcoll_enable 0 \
+              -np $NP -npernode $NUM_GPUS --bind-to numa \
+              -hostfile /etc/volcano/mpiworker.host \
+              -x NCCL_DEBUG=WARN \
+              -x NCCL_CUMEM_ENABLE=0 \
+              -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
+              -x NCCL_IB_QPS_PER_CONNECTION=2 \
+              -x NCCL_IB_GID_INDEX=3 \
+              -x NCCL_IB_HCA==mlx5_0,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9,mlx5_10,mlx5_11 \
+              -x NCCL_IB_TC=41 \
+              -x NCCL_IB_SL=0 \
+              -x NCCL_IB_TIMEOUT=16 \
+              -x HCOLL_ENABLE_MCAST_ALL=0 \
+              -x UCX_TLS=tcp \
+              -x UCX_NET_DEVICES=eth0 \
+              -x RX_QUEUE_LEN=8192 \
+              -x IB_RX_QUEUE_LEN=8192 \
+              -x NCCL_SOCKET_IFNAME=eth0 \
+              -x NCCL_IGNORE_CPU_AFFINITY=1 \
+              /workspace/nccl-tests/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
+            while :; do { [[ $exit ]] && break; }; sleep 1; done
+          ports:
+          - { name: mpijob-port, containerPort: 2222, protocol: TCP }
+          image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-25.06-nccl-2.27.7-1
+          name: mpimaster
+          resources:
+            limits:
+              ephemeral-storage: 16Gi
+            requests:
+              cpu: 4
+              ephemeral-storage: 16Gi
+              memory: 1Gi
+          securityContext:
+            privileged: true
+            capabilities:
+              add:
+              - IPC_LOCK
+          volumeMounts:
+          - { mountPath: /dev/infiniband, name: devinf }
+          - { mountPath: /dev/shm, name: shm }
+          workingDir: /workspace
+        dnsPolicy: ClusterFirstWithHostNet
+        hostNetwork: true
+        restartPolicy: OnFailure
+        terminationGracePeriodSeconds: 2
+        volumes:
+        - { name: devinf, hostPath: { path: /dev/infiniband }}
+        - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }}
+  - minAvailable: 0
+    name: mpiworker
+    replicas: 2
+    template:
+      metadata:
+      spec:
+        containers:
+        - command:
+          - /bin/bash
+          - -c
+          - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999;
+          image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-25.06-nccl-2.27.7-1
+          name: mpiworker
+          ports:
+          - { name: mpijob-port, containerPort: 2222, protocol: TCP }
+          resources:
+            limits:
+              ephemeral-storage: 32Gi
+              nvidia.com/gpu: 8
+            requests:
+              cpu: 100
+              ephemeral-storage: 32Gi
+              memory: 512Gi
+              nvidia.com/gpu: 8
+          securityContext:
+            privileged: true
+            capabilities:
+              add:
+              - IPC_LOCK
+          volumeMounts:
+          - { mountPath: /dev/infiniband, name: devinf }
+          - { mountPath: /dev/shm, name: shm }
+          workingDir: /workspace
+        dnsPolicy: ClusterFirstWithHostNet
+        hostNetwork: true
+        restartPolicy: OnFailure
+        terminationGracePeriodSeconds: 15
+        tolerations:
+        - { key: nvidia.com/gpu, operator: Exists }
+        volumes:
+        - { name: devinf, hostPath: { path: /dev/infiniband }}
+        - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }}