Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .azure-pipelines/integration-test-rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ jobs:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo /usr/local/mpi/bin/mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand All @@ -90,7 +90,7 @@ jobs:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo /usr/local/mpi/bin/mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
-x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
-x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
Expand Down
2 changes: 1 addition & 1 deletion .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
version: [ 'cuda11.8', 'cuda12.2' ]
version: [ 'cuda11.8', 'cuda12.8' ]

steps:
- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mscclpp-lang.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.2' ]
version: [ 'cuda11.8', 'cuda12.8' ]

steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion docker/base-dev-x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ RUN apt-get update && \
lcov \
vim \
&& \
apt-get autoremove && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

Expand Down
43 changes: 18 additions & 25 deletions docker/base-x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,41 +25,34 @@ RUN apt-get update && \
python3-setuptools \
python3-wheel \
sudo \
wget \
&& \
apt-get autoremove && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
wget

# Install OFED
ARG OFED_VERSION=5.2-2.2.3.0
RUN cd /tmp && \
ARCH=$(uname -m) && \
OS_VERSION=$(lsb_release -rs) && \
OS_VERSION=ubuntu${OS_VERSION} && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Install OpenMPI
ENV OPENMPI_VERSION=4.1.5
RUN cd /tmp && \
export ompi_v_parsed="$(echo ${OPENMPI_VERSION} | sed -E 's/^([0-9]+)\.([0-9]+)\..*/\1.\2/')" && \
wget -q https://download.open-mpi.org/release/open-mpi/v${ompi_v_parsed}/openmpi-${OPENMPI_VERSION}.tar.gz && \
tar xzf openmpi-${OPENMPI_VERSION}.tar.gz && \
cd openmpi-${OPENMPI_VERSION} && \
./configure --prefix=/usr/local/mpi && \
make -j && \
make install && \
cd .. && \
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
# Install OpenMPI (should be done after the OFED installation) & clean apt cache
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenmpi-dev \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64
ENV PATH="/usr/local/mpi/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}"
# OpenMPI symlink at /usr/local/mpi (for compatibility with old images)
RUN ln -s /usr/lib/x86_64-linux-gnu/openmpi /usr/local/mpi

RUN echo PATH="${PATH}" > /etc/environment && \
echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
ARG EXTRA_LD_PATH=
ENV LD_LIBRARY_PATH="${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}"
RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

ENTRYPOINT []
WORKDIR /
6 changes: 5 additions & 1 deletion docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ baseImageTable=(
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
)

Expand All @@ -23,13 +24,14 @@ extraLdPathTable=(
declare -A ofedVersionTable
ofedVersionTable=(
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
)

GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
TARGET=${1}

print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|rocm6.2]"
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|rocm6.2]"
}

if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
Expand Down Expand Up @@ -64,9 +66,11 @@ if [[ ${TARGET} == rocm* ]]; then
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
--build-arg TARGET=${TARGET} \
--build-arg ARCH="gfx942" .
docker rmi ${GHCR}-common:base-${TARGET}
else
echo "Building CUDA base image..."
docker tag ${GHCR}-common:base-${TARGET} ${GHCR}:base-${TARGET}
docker rmi --no-prune ${GHCR}-common:base-${TARGET}
fi

docker build -t ${GHCR}:base-dev-${TARGET} \
Expand Down
27 changes: 14 additions & 13 deletions test/deploy/run_tests.sh
Original file line number Diff line number Diff line change
@@ -1,45 +1,46 @@
set -e
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
export PATH=/usr/local/mpi/bin:$PATH

function run_mscclpp_test()
{
echo "=================Run allgather_test_perf on 2 nodes========================="
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

# For kernel 2, the message size must be divisible by 3
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl

/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl

echo "==================Run allreduce_test_perf on 2 nodes========================="
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl

/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl

/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl

/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl

echo "==================Run alltoall_test_perf on 2 nodes========================="
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

Expand All @@ -51,28 +52,28 @@ function run_mscclpp_test()
function run_mp_ut()
{
echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
/usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003

echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
/usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
}

function run_pytests()
{
echo "==================Run python tests================================"
/usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh
}

function run_py_benchmark()
{
echo "==================Run python benchmark================================"
/usr/local/mpi/bin/mpirun -allow-run-as-root -np 16 --bind-to numa \
mpirun -allow-run-as-root -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
-x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
Expand Down
Loading