Skip to content

Commit 41474c5

Browse files
Merge pull request #83 from oracle-quickstart/update-rccl-tests
Update RCCL tests
2 parents 3e2da79 + caa1a11 commit 41474c5

File tree

4 files changed

+254
-63
lines changed

4 files changed

+254
-63
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ helm install kueue oci://registry.k8s.io/kueue/charts/kueue --version="0.14.1" -
161161
### Run the NCCL/RCCL Tests
162162

163163
> [!IMPORTANT]
164-
> The NCCL parameters differ between GPU shapes. Ensure that you use the correct manifest for your specific bare metal GPU shape.
164+
> The NCCL/RCCL parameters differ between GPU shapes. Ensure that you use the correct manifest for your specific bare metal GPU shape.
165165
166166
#### BM.GPU.GB200-v2.4
167167
```sh
@@ -198,6 +198,11 @@ kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke
198198
kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/nccl-tests/kueue/BM.GPU.B4.8.yaml
199199
```
200200

201+
#### BM.GPU.MI300X.8
202+
```sh
203+
kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/rccl-tests/kueue/BM.GPU.MI300X.8.yaml
204+
```
205+
201206
The initial container image pull may take some time. Once the launcher pod (`nccl-test-launcher-XXXXX`, or `rccl-tests-launcher-XXXXX` when using the RCCL manifest) starts running, you can check its logs for the NCCL/RCCL test results.
202207

203208
### Example Output

docker/rccl-tests/Dockerfile

Lines changed: 50 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,29 @@
1-
## base docker image
1+
## Base docker image
22
ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04
3-
ARG ROCM_IMAGE_TAG=6.3.2
3+
ARG ROCM_IMAGE_TAG=7.0.2
44
FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}"
55

6-
## rccl repo
7-
ARG RCCL_REPO=https://github.com/ROCm/rccl
8-
ARG RCCL_BRANCH=develop
6+
## Re-declare ROCM_IMAGE_TAG after FROM so it is visible inside the build stage (it inherits the default declared before FROM)
7+
ARG ROCM_IMAGE_TAG
98

10-
## rccl-tests repo
9+
## RCCL repo
10+
ARG RCCL_VERSION=rocm-${ROCM_IMAGE_TAG}
11+
12+
## RCCL tests repo
1113
ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests
1214
ARG RCCL_TESTS_BRANCH=develop
1315

14-
## Mellanox OFED version
15-
ARG MELLANOX_OFED_VERSION
16+
## AMD GPU Targets
17+
ARG GPU_TARGETS=gfx942
1618

17-
## creating scratch space
18-
RUN mkdir -p /workspace
19-
WORKDIR /workspace
19+
ENV WORKDIR=/workspace
20+
RUN mkdir -p ${WORKDIR}
21+
WORKDIR ${WORKDIR}
2022

21-
## install dependencies
2223
RUN apt-get update \
2324
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
2425
ca-certificates \
2526
git \
26-
openssh-server \
27-
iputils-ping \
28-
net-tools \
2927
make \
3028
rocm-cmake \
3129
ninja-build \
@@ -60,65 +58,76 @@ RUN apt-get update \
6058
python3-dev \
6159
python3-tk \
6260
python3-yaml \
63-
wget \
61+
vim \
62+
less \
63+
openssh-client \
64+
openssh-server \
6465
&& \
6566
apt-get clean && \
6667
rm -rf /var/lib/apt/lists/*
6768

68-
# Mellanox OFED
69-
RUN wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add -
70-
RUN cd /etc/apt/sources.list.d/ && wget https://linux.mellanox.com/public/repo/mlnx_ofed/${MELLANOX_OFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list
71-
72-
RUN apt-get -qq update \
73-
&& apt-get -qq install -y --no-install-recommends \
74-
ibverbs-utils libibverbs-dev libibumad3 libibumad-dev librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils \
75-
&& rm -rf /var/lib/apt/lists/*
76-
7769
RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0-linux-x86_64.sh \
7870
&& chmod +x cmake-3.28.0-linux-x86_64.sh \
7971
&& bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \
8072
&& rm cmake-3.28.0-linux-x86_64.sh
8173

74+
# Mellanox OFED
75+
RUN wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add -
76+
RUN cd /etc/apt/sources.list.d/ && wget https://linux.mellanox.com/public/repo/mlnx_ofed/${MELLANOX_OFED_VERSION:-latest}/ubuntu22.04/mellanox_mlnx_ofed.list
77+
RUN apt-get -qq update \
78+
&& apt-get -qq install -y --no-install-recommends \
79+
ibverbs-utils libibverbs-dev libibumad3 libibumad-dev librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils \
80+
&& rm -rf /var/lib/apt/lists/*
81+
82+
## Set ROCm path
83+
ENV ROCM_PATH=/opt/rocm
84+
8285
## Install UCX
8386
ENV UCX_INSTALL_PREFIX=/opt/ucx
84-
RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \
87+
RUN wget https://github.com/openucx/ucx/releases/download/v1.19.0/ucx-1.19.0.tar.gz \
8588
&& mkdir -p ucx \
86-
&& tar -zxf ucx-1.16.0.tar.gz -C ucx --strip-components=1 \
89+
&& tar -zxf ucx-1.19.0.tar.gz -C ucx --strip-components=1 \
8790
&& cd ucx \
8891
&& mkdir build \
8992
&& cd build \
90-
&& ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=/opt/rocm \
93+
&& ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=${ROCM_PATH} \
9194
&& make -j$(nproc) install \
9295
&& cd ../.. \
93-
&& rm -rf ucx ucx-1.16.0.tar.gz
96+
&& rm -rf ucx ucx-1.19.0.tar.gz
9497

9598
## Install OpenMPI
9699
ENV MPI_INSTALL_PREFIX=/opt/ompi
97-
RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz \
100+
RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.8.tar.gz \
98101
&& mkdir -p ompi4 \
99-
&& tar -zxf openmpi-4.1.6.tar.gz -C ompi4 --strip-components=1 \
102+
&& tar -zxf openmpi-4.1.8.tar.gz -C ompi4 --strip-components=1 \
100103
&& cd ompi4 \
101104
&& mkdir build \
102105
&& cd build \
103106
&& ../configure --prefix=${MPI_INSTALL_PREFIX} --with-ucx=${UCX_INSTALL_PREFIX} --disable-oshmem --disable-mpi-fortran --enable-orterun-prefix-by-default \
104107
&& make -j$(nproc) install \
105108
&& cd ../.. \
106-
&& rm -rf ompi4 openmpi-4.1.6.tar.gz
109+
&& rm -rf ompi4 openmpi-4.1.8.tar.gz
107110

108-
## building RCCL
109-
ENV RCCL_INSTALL_PREFIX=/opt/rocm
110-
RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" ./rccl \
111+
## Build RCCL
112+
ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl/install
113+
RUN git clone --recurse-submodules -b ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl \
111114
&& cd ./rccl \
112-
&& ./install.sh -t -j$(nproc) --amdgpu_targets="gfx942" --prefix=${RCCL_INSTALL_PREFIX}
115+
&& ./install.sh --amdgpu_targets=${GPU_TARGETS} --prefix=${RCCL_INSTALL_PREFIX}
113116

114-
## building RCCL-Tests
117+
## Build RCCL-Tests
115118
RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \
116119
&& cd ./rccl-tests \
117-
&& make MPI=1 MPI_HOME=${MPI_INSTALL_PREFIX} NCCL_HOME=${RCCL_INSTALL_PREFIX} -j$(nproc)
120+
&& mkdir build \
121+
&& cd build \
122+
&& cmake -DCMAKE_BUILD_TYPE=Release -DUSE_MPI=ON -DCMAKE_PREFIX_PATH="${RCCL_INSTALL_PREFIX};${MPI_INSTALL_PREFIX}" -DGPU_TARGETS=${GPU_TARGETS} .. \
123+
&& make -j$(nproc)
118124

119-
## set environment variables
120-
ENV PATH="${RCCL_INSTALL_PREFIX}/bin:${MPI_INSTALL_PREFIX}/bin:${PATH}"
121-
ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}/lib:${MPI_INSTALL_PREFIX}/lib:${LD_LIBRARY_PATH}"
125+
## Set environment variables
126+
ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}"
127+
ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib"
128+
ENV UCX_WARN_UNUSED_ENV_VARS=n
129+
ENV OMPI_ALLOW_RUN_AS_ROOT=1
130+
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
122131

123132
# Configure SSH
124133
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
apiVersion: kueue.x-k8s.io/v1beta1
2+
kind: ResourceFlavor
3+
metadata:
4+
name: bm-gpu-mi300x-8
5+
spec:
6+
nodeLabels:
7+
node.kubernetes.io/instance-type: BM.GPU.MI300X.8
8+
amd.com/gpu: "true"
9+
---
10+
apiVersion: kueue.x-k8s.io/v1beta1
11+
kind: ClusterQueue
12+
metadata:
13+
name: bm-gpu-mi300x-8-rccl-tests-queue
14+
spec:
15+
namespaceSelector: {}
16+
resourceGroups:
17+
- coveredResources: ["cpu", "memory", "amd.com/gpu", "ephemeral-storage"]
18+
flavors:
19+
- name: bm-gpu-mi300x-8
20+
resources:
21+
- name: cpu
22+
nominalQuota: "5000"
23+
- name: memory
24+
nominalQuota: "204800Gi"
25+
- name: amd.com/gpu
26+
nominalQuota: "1600"
27+
- name: ephemeral-storage
28+
nominalQuota: "12800Gi"
29+
---
30+
apiVersion: kueue.x-k8s.io/v1beta1
31+
kind: LocalQueue
32+
metadata:
33+
name: bm-gpu-mi300x-8-rccl-tests
34+
spec:
35+
clusterQueue: bm-gpu-mi300x-8-rccl-tests-queue
36+
---
37+
apiVersion: kubeflow.org/v2beta1
38+
kind: MPIJob
39+
metadata:
40+
name: rccl-tests
41+
labels:
42+
kueue.x-k8s.io/queue-name: bm-gpu-mi300x-8-rccl-tests
43+
spec:
44+
slotsPerWorker: 8
45+
runPolicy:
46+
cleanPodPolicy: "Running"
47+
sshAuthMountPath: /root/.ssh
48+
mpiReplicaSpecs:
49+
Launcher:
50+
replicas: 1
51+
template:
52+
metadata:
53+
labels:
54+
rccl-tests-replica: mpi-launcher
55+
spec:
56+
hostNetwork: true
57+
dnsPolicy: ClusterFirstWithHostNet
58+
restartPolicy: OnFailure
59+
terminationGracePeriodSeconds: 2
60+
containers:
61+
- name: mpi-launcher
62+
image: iad.ocir.io/idxzjcdglx2s/rccl-tests:rocm-7.0.2-rccl-2.26.6-ubuntu22.04-102025.1
63+
imagePullPolicy: Always
64+
ports:
65+
- { name: mpijob-port, containerPort: 2222, protocol: TCP }
66+
command: ["bash", "-c"]
67+
args:
68+
- |
69+
set -e -o pipefail
70+
trap 'exit=1' SIGINT
71+
NUM_GPUS=8
72+
NUM_HOSTS=$(sed -n '$=' /etc/mpi/hostfile)
73+
NP=$(($NUM_HOSTS*$NUM_GPUS))
74+
75+
while ! (for host in $(awk '{print $1}' /etc/mpi/hostfile); do
76+
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no -p 2222 $host exit 2>/dev/null || exit 1
77+
done); do
78+
echo "Waiting for workers to be ready..."
79+
sleep 5
80+
done
81+
echo "All workers are ready!"
82+
83+
mpirun \
84+
-mca coll ^hcoll \
85+
-mca pml ucx \
86+
-mca btl ^openib \
87+
-mca plm_rsh_args '-p 2222' \
88+
--allow-run-as-root \
89+
-np $NP \
90+
-npernode $NUM_GPUS \
91+
--bind-to numa \
92+
-x RX_QUEUE_LEN=8192 \
93+
-x IB_RX_QUEUE_LEN=8192 \
94+
-x UCX_NET_DEVICES=mlx5_0:1 \
95+
-x HCOLL_ENABLE_MCAST_ALL=0 \
96+
-x coll_hcoll_enable=0 \
97+
-x NCCL_CUMEM_ENABLE=0 \
98+
-x NCCL_IB_TIMEOUT=22 \
99+
-x NCCL_IB_SL=0 \
100+
-x NCCL_IB_TC=41 \
101+
-x NCCL_IB_GID_INDEX=3 \
102+
-x NCCL_DEBUG=WARN \
103+
-x NCCL_IB_QPS_PER_CONNECTION=1 \
104+
-x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
105+
-x NCCL_IB_HCA='=mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9' \
106+
-x NCCL_PXN_DISABLE=0 \
107+
-x NCCL_NET_PLUGIN=none \
108+
-x LD_LIBRARY_PATH \
109+
/workspace/rccl-tests/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 -n 50
110+
resources:
111+
limits:
112+
ephemeral-storage: 32Gi
113+
requests:
114+
cpu: 2
115+
ephemeral-storage: 32Gi
116+
memory: 2Gi
117+
securityContext:
118+
privileged: true
119+
capabilities:
120+
add: [IPC_LOCK, SYS_PTRACE]
121+
volumeMounts:
122+
- { mountPath: /dev/shm, name: shm }
123+
workingDir: /workspace
124+
volumes:
125+
- { name: shm, emptyDir: { medium: Memory, sizeLimit: 128Gi }}
126+
Worker:
127+
replicas: 2
128+
template:
129+
metadata:
130+
labels:
131+
rccl-tests-replica: mpi-worker
132+
spec:
133+
hostNetwork: true
134+
dnsPolicy: ClusterFirstWithHostNet
135+
restartPolicy: OnFailure
136+
terminationGracePeriodSeconds: 2
137+
containers:
138+
- name: mpi-worker
139+
image: iad.ocir.io/idxzjcdglx2s/rccl-tests:rocm-7.0.2-rccl-2.26.6-ubuntu22.04-102025.1
140+
imagePullPolicy: Always
141+
ports:
142+
- { name: mpijob-port, containerPort: 2222, protocol: TCP }
143+
command:
144+
- /bin/bash
145+
- -c
146+
- mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222
147+
resources:
148+
limits:
149+
ephemeral-storage: 32Gi
150+
amd.com/gpu: 8
151+
requests:
152+
cpu: 200
153+
ephemeral-storage: 32Gi
154+
memory: 1024Gi
155+
amd.com/gpu: 8
156+
securityContext:
157+
privileged: true
158+
capabilities:
159+
add: [IPC_LOCK, SYS_PTRACE]
160+
volumeMounts:
161+
- { mountPath: /dev/shm, name: shm }
162+
workingDir: /workspace
163+
tolerations:
164+
- { key: amd.com/gpu, operator: Exists }
165+
volumes:
166+
- { name: shm, emptyDir: { medium: Memory, sizeLimit: 128Gi }}
167+

0 commit comments

Comments
 (0)