Test peer accessibility after deployment (#661)

chhwang · web-flow · commit a2f1279c6092 · 2025-10-24T11:09:36.000-07:00
Test the GPUs' peer accessibility before integration testing so that VM issues
can be distinguished from genuine test failures.
diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh
@@ -13,6 +13,10 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
     nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
 done
 
+make -C /root/mscclpp/tools/peer-access-test  # build the GPU peer-access probe
+/root/mscclpp/tools/peer-access-test/peer_access_test  # exits non-zero if any GPU pair lacks peer access; NOTE(review): this only aborts setup if the script runs under 'set -e' - confirm
+make -C /root/mscclpp/tools/peer-access-test clean  # remove the probe binary once the check has run
+
 if [[ "${CUDA_VERSION}" == *"11."* ]]; then
     pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
 else
diff --git a/tools/peer-access-test/Makefile b/tools/peer-access-test/Makefile
@@ -0,0 +1,28 @@
+# Builds peer_access_test, a small probe that checks GPU peer-to-peer access.
+# Uses nvcc from CUDA_HOME when it is present; otherwise falls back to hipcc from ROCM_HOME.
+CUDA_HOME ?= /usr/local/cuda
+ROCM_HOME ?= /opt/rocm
+
+# Check if nvcc exists, otherwise use hipcc
+ifeq ($(shell which $(CUDA_HOME)/bin/nvcc 2>/dev/null),)
+    COMPILER := $(ROCM_HOME)/bin/hipcc
+    ARCH_FLAG := -D__HIP_PLATFORM_AMD__=1
+else
+    COMPILER := $(CUDA_HOME)/bin/nvcc
+    ARCH_FLAG := -arch=native
+endif
+
+TARGET = peer_access_test
+SRC = peer_access_test.cu
+
+# all and clean are commands, not files: declare them phony so they still run
+# even if a file with one of those names appears in this directory.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(COMPILER) $(ARCH_FLAG) -o $@ $<
+
+clean:
+	rm -f $(TARGET)
diff --git a/tools/peer-access-test/peer_access_test.cu b/tools/peer-access-test/peer_access_test.cu
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+// Checks that every visible GPU reports peer access to every other visible GPU.
+// Exits with 0 when all ordered pairs are accessible (trivially so with fewer
+// than two devices), with 1 when some pair is not, and with EXIT_FAILURE when a
+// runtime API call fails.
+
+#if defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_runtime.h>
+// Map the few CUDA runtime names used below onto their HIP equivalents.
+using cudaError_t = hipError_t;
+constexpr auto cudaSuccess = hipSuccess;
+#define cudaGetDeviceCount(...) hipGetDeviceCount(__VA_ARGS__)
+#define cudaDeviceCanAccessPeer(...) hipDeviceCanAccessPeer(__VA_ARGS__)
+#define cudaGetErrorString(...) hipGetErrorString(__VA_ARGS__)
+#else
+#include <cuda_runtime.h>
+#endif
+
+#include <cstdlib>
+#include <iostream>
+
+// Runs a runtime API call and terminates the process with EXIT_FAILURE if it
+// does not return cudaSuccess, printing the failing expression, the numeric
+// error code, and the runtime's description of it.
+#define CUDACHECK(cmd)                                            \
+  do {                                                            \
+    cudaError_t e = (cmd);                                        \
+    if (e != cudaSuccess) {                                       \
+      std::cerr << "Failed: " #cmd << " returned " << e << " ("   \
+                << cudaGetErrorString(e) << ")" << std::endl;     \
+      std::exit(EXIT_FAILURE);                                    \
+    }                                                             \
+  } while (0)
+
+int main() {
+  int devCount = 0;
+  CUDACHECK(cudaGetDeviceCount(&devCount));
+  std::cout << "Detected " << devCount << " device(s)" << std::endl;
+
+  // Query every ordered pair (i, j) with i != j, so access is checked in both directions.
+  // With fewer than two devices the loops find no pair and the check passes trivially.
+  bool canAccessPeerAll = true;
+  for (int i = 0; i < devCount; ++i) {
+    for (int j = 0; j < devCount; ++j) {
+      if (i == j) continue;
+      int canAccessPeer = 0;
+      CUDACHECK(cudaDeviceCanAccessPeer(&canAccessPeer, i, j));
+      if (!canAccessPeer) {
+        canAccessPeerAll = false;
+        // Keep scanning so that every inaccessible pair gets reported.
+        std::cerr << "Device " << i << " cannot access peer Device " << j << std::endl;
+      }
+    }
+  }
+
+  if (canAccessPeerAll) {
+    std::cout << "All devices can access each other" << std::endl;
+  }
+  return canAccessPeerAll ? 0 : 1;
+}