Skip to content
Open
151 changes: 151 additions & 0 deletions .azure-pipelines/templates/ut-size-alignment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
---
# .azure-pipelines/templates/ut-size-alignment.yaml
# ----------------------------------------
# A step template that runs unit tests to track input-size alignment using
# PyTorch and MSCCLPP.
#
# Flow: build MSCCLPP on the agent, start the VMSS, copy the build and test
# trees to the first host listed in the CI hostfile, start a PyTorch container
# there with both trees mounted, run the AllReduce correctness test over
# several element counts (chosen to exercise unaligned sizes), then remove the
# container/files and deallocate the VMSS. The two teardown steps use
# condition: always() so they run even when a test step fails.
#
# Parameters:
#   subscription     – Azure subscription to use for VMSS start/stop
#   vmssName         – Name of the VMSS to use
#   sshKeySecureFile – the secureFile name for your SSH key
#   pytorchImage     – PyTorch Docker image to use for unit tests

parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: sshKeySecureFile
  type: string
- name: pytorchImage
  type: string
  default: "mcr.microsoft.com/mirror/nvcr/nvidia/pytorch:25.03-py3"

steps:
- task: Bash@3
  name: Build
  displayName: Build MSCCLPP
  inputs:
    targetType: inline
    script: |
      mkdir build && cd build
      # GPU check is bypassed: the build agent has no GPU; the binaries run
      # later on the VMSS host.
      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: inline
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      # Azure CLI is needed by the AzureCLI@2 steps running inside the
      # container image.
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

# Downloads the SSH private key; the task name "SshKeyFile" makes the path
# available to later scripts as the SSHKEYFILE_SECUREFILEPATH env variable.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}

- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp

- task: Bash@3
  name: DeployBenchmarkEnv
  displayName: Deploy Benchmark Environment
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}

      # The first column of the hostfile's first line is the target host.
      HOST_IP=$(head -1 ${HOSTFILE} | awk '{print $1}')

      # Pull the PyTorch image on the remote machine
      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST_IP} "sudo docker pull ${{ parameters.pytorchImage }}"

      # Stop any existing container from a previous (possibly aborted) run
      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST_IP} "sudo docker rm -f mscclpp-benchmark || true"

      # Copy MSCCLPP build and test trees to the remote machine
      scp -i ${KeyFilePath} -o ${SSH_OPTION} -r ${ROOT_DIR}/build ${HOST_IP}:/tmp/mscclpp-build
      scp -i ${KeyFilePath} -o ${SSH_OPTION} -r ${ROOT_DIR}/test ${HOST_IP}:/tmp/mscclpp-test

      # Start a long-lived PyTorch container with the MSCCLPP trees mounted;
      # "sleep infinity" keeps it alive for the docker exec calls below.
      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST_IP} "sudo docker run -d --name mscclpp-benchmark \
        --gpus all \
        --ipc=host \
        --ulimit memlock=-1 \
        --ulimit stack=67108864 \
        -v /tmp/mscclpp-build:/root/mscclpp/build \
        -v /tmp/mscclpp-test:/root/mscclpp/test \
        ${{ parameters.pytorchImage }} \
        sleep infinity"
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: AllReduceBenchmarkMultipleSizes
  displayName: Run AllReduce Benchmark Test (Multiple Sizes to test the alignment)
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      HOST_IP=$(head -1 ${HOSTFILE} | awk '{print $1}')

      # NOTE(review): the previous version created and tailed a local file
      # named "azureuser@${HOST_IP}". That pattern comes from parallel-ssh
      # output directories; plain ssh below writes straight to stdout, so the
      # file was never written and the tail was dead code — removed.

      # Run with element counts that are deliberately NOT all multiples of the
      # rank count / alignment, to exercise the size-padding paths.
      for nelem in 10556576 10556587 10556592 10556608 1048576 9999999 12345678; do
        echo "Running AllReduce benchmark with nelem=${nelem}"
        ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST_IP} "sudo docker exec -t mscclpp-benchmark bash -c \" \
          set -e; \
          cd /root/mscclpp; \
          LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so torchrun --nproc_per_node=8 test/torch/correctness_test.py --collective allreduce --nelem ${nelem} --dtype float\""
      done
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# Best-effort teardown of the remote container and staged files; runs even if
# the tests failed.
- task: Bash@3
  name: Cleanup
  displayName: Cleanup Benchmark Container
  condition: always()
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      HOST_IP=$(head -1 ${HOSTFILE} | awk '{print $1}')

      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST_IP} "sudo docker rm -f mscclpp-benchmark || true"
      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST_IP} "sudo rm -rf /tmp/mscclpp-build /tmp/mscclpp-test || true"
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# Deallocate (not just stop) so the VMSS stops incurring compute charges.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
42 changes: 42 additions & 0 deletions .azure-pipelines/ut-size-alignment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
# Pipeline definition for the input-size-alignment unit tests; all real work
# is delegated to templates/ut-size-alignment.yaml.

# CI trigger: pushes to main and release branches, skipping changes that only
# touch docs, tooling, or markdown.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

# PR validation: same branch/path filters; draft PRs are not validated.
pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
  - job: BenchmarkTestH100
    displayName: Benchmark Test H100
    # Agent pool of H100-capable CI machines; steps run inside the dev
    # container image below.
    pool:
      name: msccl-ci-h100
    container:
      image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4

    steps:
      - template: templates/ut-size-alignment.yaml
        parameters:
          subscription: mscclpp-ci-h100
          vmssName: mscclpp-h100-ci
          sshKeySecureFile: mscclpp.pem
          pytorchImage: mcr.microsoft.com/mirror/nvcr/nvidia/pytorch:25.03-py3
39 changes: 35 additions & 4 deletions apps/nccl/src/allreduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,8 +963,20 @@ __global__ void __launch_bounds__(1024, 1)
int nBlocksForReduce = nRanksPerNode;
int copyReduceRatio = nBlocksForCopy / nBlocksForReduce;
size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode;
size_t sizePerRank = size / nRanksPerNode;
assert(sizePerRank % alignment == 0);

// Pad size to be divisible by (nRanksPerNode * alignment)
size_t paddingNeeded =
(nRanksPerNode * alignment - (size % (nRanksPerNode * alignment))) % (nRanksPerNode * alignment);
size_t paddedSize = size + paddingNeeded;
size_t sizePerRank = paddedSize / nRanksPerNode;

// Calculate actual size this rank should process (without padding)
size_t actualSizeThisRank = sizePerRank;
if (rank == nRanksPerNode - 1) {
// Last rank might have less actual data due to padding
actualSizeThisRank = size - (sizePerRank * (nRanksPerNode - 1));
}

uint32_t sizePerBlock =
((sizePerRank + (nBlocksForCopy - 1)) / nBlocksForCopy + alignment - 1) / alignment * alignment;
uint32_t lastBlockSize = sizePerRank - (nBlocksForCopy - 1) * sizePerBlock;
Expand Down Expand Up @@ -1008,7 +1020,17 @@ __global__ void __launch_bounds__(1024, 1)
uint32_t scratchOffset = scratchIt * unitSize + bid * scratchSizePerBlock + i * scratchSizePerRank;
char* srcData = (char*)src + blockOffset;
char* dstData = (char*)scratch + scratchOffset;
mscclpp::copy(dstData, srcData, iterSize, tid, blockDim.x);
// Calculate actual copy size - don't copy beyond actual data on last rank
size_t actualCopySize = iterSize;
if (i == nRanksPerNode - 1 && blockOffset + iterSize > i * sizePerRank + actualSizeThisRank) {
// On last rank, clamp to actual data size
actualCopySize = (i * sizePerRank + actualSizeThisRank > blockOffset)
? (i * sizePerRank + actualSizeThisRank - blockOffset)
: 0;
}
if (actualCopySize > 0) {
mscclpp::copy(dstData, srcData, actualCopySize, tid, blockDim.x);
}
}
__syncthreads();
if (tid < nPeers) {
Expand Down Expand Up @@ -1067,7 +1089,16 @@ __global__ void __launch_bounds__(1024, 1)
i * scratchSizePerRank;
char* srcData = (char*)scratch + scratchOffset;
char* dstData = (char*)dst + blockOffset;
mscclpp::copy(dstData, srcData, iterSize, tid, blockDim.x);

size_t actualCopySize = iterSize;
if (i == nRanksPerNode - 1 && blockOffset + iterSize > i * sizePerRank + actualSizeThisRank) {
actualCopySize = (i * sizePerRank + actualSizeThisRank > blockOffset)
? (i * sizePerRank + actualSizeThisRank - blockOffset)
: 0;
}
if (actualCopySize > 0) {
mscclpp::copy(dstData, srcData, actualCopySize, tid, blockDim.x);
}
}
__syncthreads();
if (tid == 0) {
Expand Down
4 changes: 3 additions & 1 deletion test/torch/correctness_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def _init_dist():
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ.get("LOCAL_RANK", os.environ["RANK"]))
dist.init_process_group(backend=backend, rank=rank, world_size=world_size, device_id=local_rank)
dist.init_process_group(
backend=backend, rank=rank, world_size=world_size, device_id=torch.device(f"cuda:{local_rank}")
)
torch.cuda.set_device(local_rank)


Expand Down