Skip to content

Commit 2e73506

Browse files
Merge pull request #52 from oracle-quickstart/rccl-test-update
Update BM.GPU.MI300X.8.yaml
2 parents a34158b + af57cf5 commit 2e73506

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

manifests/rccl-tests/BM.GPU.MI300X.8.yaml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ spec:
3535
--mca btl ^openib \
3636
-x NCCL_DEBUG=VERSION \
3737
-x NCCL_IB_HCA==mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9 \
38-
-x NCCL_SOCKET_IFNAME=eth0 \
38+
-x UCX_NET_DEVICES=eth0 \
3939
-x NCCL_IB_TC=41 \
4040
-x NCCL_IB_SL=0 \
4141
-x NCCL_IB_GID_INDEX=3 \
@@ -47,14 +47,14 @@ spec:
4747
/workspace/rccl-tests/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
4848
ports:
4949
- { name: mpijob-port, containerPort: 2222, protocol: TCP }
50-
image: iad.ocir.io/hpc_limited_availability/oke/rccl-tests:rocm-6.2.1-ofed-5.9-0.5.6.0.127
50+
image: iad.ocir.io/hpc_limited_availability/oke/rccl-tests:rocm-6.3.2-OFED-24.10-1.1.4.0
5151
imagePullPolicy: Always
5252
name: mpimaster
5353
resources:
5454
limits:
5555
ephemeral-storage: 32Gi
5656
requests:
57-
cpu: 8
57+
cpu: 2
5858
ephemeral-storage: 32Gi
5959
memory: 2Gi
6060
securityContext:
@@ -82,7 +82,7 @@ spec:
8282
- sysctl --system; mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222
8383
ports:
8484
- { name: mpijob-port, containerPort: 2222, protocol: TCP }
85-
image: iad.ocir.io/hpc_limited_availability/oke/rccl-tests:rocm-6.2.1-ofed-5.9-0.5.6.0.127
85+
image: iad.ocir.io/hpc_limited_availability/oke/rccl-tests:rocm-6.3.2-OFED-24.10-1.1.4.0
8686
imagePullPolicy: Always
8787
name: mpiworker
8888
resources:
@@ -108,4 +108,4 @@ spec:
108108
tolerations:
109109
- { key: amd.com/gpu, operator: Exists }
110110
volumes:
111-
- { name: shm, emptyDir: { medium: Memory, sizeLimit: 128Gi }}
111+
- { name: shm, emptyDir: { medium: Memory, sizeLimit: 128Gi }}

0 commit comments

Comments
 (0)