Skip to content

Commit a2f1279

Browse files
authored
Test peer accessibility after deployment (#661)
Test GPUs' peer accessibility before integration testing to distinguish VM issues.
1 parent 4d4f087 commit a2f1279

File tree

3 files changed

+74
-0
lines changed

3 files changed

+74
-0
lines changed

test/deploy/setup.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
1313
nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
1414
done
1515

16+
# Build the GPU peer-access test, run it, then invoke its `clean` target.
# NOTE(review): whether a non-zero exit from the test aborts this script depends on
# shell options (e.g. `set -e`) that are not visible in this hunk -- confirm.
make -C /root/mscclpp/tools/peer-access-test
17+
/root/mscclpp/tools/peer-access-test/peer_access_test
18+
make -C /root/mscclpp/tools/peer-access-test clean
19+
1620
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
1721
pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
1822
else

tools/peer-access-test/Makefile

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Builds the standalone peer_access_test binary from peer_access_test.cu.
# Toolkit roots; `?=` lets the environment or the make command line override them.
CUDA_HOME ?= /usr/local/cuda
ROCM_HOME ?= /opt/rocm

# Check if nvcc exists, otherwise use hipcc.
# On the hipcc path, __HIP_PLATFORM_AMD__ is defined so the source takes its HIP branch.
ifeq ($(shell which $(CUDA_HOME)/bin/nvcc 2>/dev/null),)
COMPILER := $(ROCM_HOME)/bin/hipcc
ARCH_FLAG := -D__HIP_PLATFORM_AMD__=1
else
COMPILER := $(CUDA_HOME)/bin/nvcc
ARCH_FLAG := -arch=native
endif

TARGET = peer_access_test
SRC = peer_access_test.cu

# `all` and `clean` name actions, not files: without this declaration a stray file
# called `all` or `clean` in this directory would make the target look up to date.
.PHONY: all clean

all: $(TARGET)

$(TARGET): $(SRC)
	$(COMPILER) $(ARCH_FLAG) -o $@ $<

clean:
	rm -f $(TARGET)
tools/peer-access-test/peer_access_test.cu

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
4+
#if defined(__HIP_PLATFORM_AMD__)
5+
#include <hip/hip_runtime.h>
6+
using cudaError_t = hipError_t;
7+
constexpr auto cudaSuccess = hipSuccess;
8+
#define cudaGetDeviceCount(...) hipGetDeviceCount(__VA_ARGS__)
9+
#define cudaDeviceCanAccessPeer(...) hipDeviceCanAccessPeer(__VA_ARGS__)
10+
#else
11+
#include <cuda_runtime.h>
12+
#endif
13+
14+
#include <cstdlib>
#include <iostream>
15+
16+
// Evaluates a CUDA/HIP runtime call exactly once and terminates the process if the
// call did not return cudaSuccess. On failure it prints the stringified call and the
// numeric error code to stderr, then exits with EXIT_FAILURE.
//
// The argument is parenthesized and the temporary carries a macro-specific name, so a
// call site that has its own variable named `e`, or that passes a compound
// expression, still expands to what the caller wrote.
#define CUDACHECK(cmd)                                                                  \
  do {                                                                                  \
    cudaError_t cudacheck_status_ = (cmd);                                              \
    if (cudacheck_status_ != cudaSuccess) {                                             \
      std::cerr << "Failed: " #cmd << " returned " << cudacheck_status_ << std::endl;   \
      std::exit(EXIT_FAILURE);                                                          \
    }                                                                                   \
  } while (0)
24+
25+
int main() {
26+
bool canAccessPeerAll = true;
27+
int devCount = 0;
28+
CUDACHECK(cudaGetDeviceCount(&devCount));
29+
std::cout << "Detected " << devCount << " device(s)" << std::endl;
30+
if (devCount >= 2) {
31+
for (int i = 0; i < devCount; ++i) {
32+
for (int j = 0; j < devCount; ++j) {
33+
if (i != j) {
34+
int canAccessPeer = 0;
35+
CUDACHECK(cudaDeviceCanAccessPeer(&canAccessPeer, i, j));
36+
if (!canAccessPeer) {
37+
canAccessPeerAll = false;
38+
std::cerr << "Device " << i << " cannot access peer Device " << j << std::endl;
39+
}
40+
}
41+
}
42+
}
43+
}
44+
if (canAccessPeerAll) {
45+
std::cout << "All devices can access each other" << std::endl;
46+
}
47+
return canAccessPeerAll ? 0 : 1;
48+
}

0 commit comments

Comments
 (0)