Skip to content

Commit a48197a

Browse files
Merge branch 'main' into rust_dlist_get_set
2 parents a2752da + 4cef361 commit a48197a

28 files changed

+275
-141
lines changed

.ci/jenkins/lib/build-container-matrix.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,10 @@ steps:
5757
- name: Prepare
5858
run: |
5959
# Setup podman and dependencies
60+
rm -f /etc/containers/storage.conf
61+
podman system reset -f || true
6062
ln -sfT $(type -p podman) /usr/bin/docker
61-
yum install -y git gettext
63+
yum install -y gettext
6264
6365
- name: Build NIXLBench
6466
enable: ${ENABLE_NIXLBENCH_BUILD}

.ci/jenkins/lib/build-matrix.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ steps:
8484
parallel: false
8585
containerSelector: "{ name: 'podman.*' }"
8686
run: |
87+
# remove the custom storage.conf and reset podman storage (changes the storage driver to improve build performance)
88+
rm -f /etc/containers/storage.conf ; podman system reset -f || true
8789
# symlink podman to docker - scripts work with docker commands
8890
ln -sfT $(type -p podman) /usr/bin/docker
89-
# install git for building container image
90-
yum install -y git
9191
contrib/build-container.sh --build-type debug --no-cache

.ci/jenkins/lib/test-matrix.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,28 @@ steps:
7171
# check rdma status
7272
ibv_devinfo
7373
#ib_write_bw
74+
# Ensure proper MPS configuration: MPS requires EXCLUSIVE_PROCESS compute mode
75+
# If MPS is running without EXCLUSIVE_PROCESS mode, that's a misconfiguration that causes hangs
76+
GPU_COUNT=$(nvidia-smi -L | wc -l)
77+
if [ "$GPU_COUNT" -gt 0 ]; then
78+
# Check if MPS is running
79+
MPS_RUNNING=false
80+
if command -v nvidia-cuda-mps-control >/dev/null 2>&1 && pgrep -f nvidia-cuda-mps-server >/dev/null 2>&1; then
81+
MPS_RUNNING=true
82+
fi
83+
84+
if [ "$MPS_RUNNING" = true ]; then
85+
# MPS is running - ensure EXCLUSIVE_PROCESS mode is set (required for MPS)
86+
for i in $(seq 0 $((GPU_COUNT-1))); do
87+
nvidia-smi -i $i -c EXCLUSIVE_PROCESS || true
88+
done
89+
else
90+
# MPS is not running - ensure DEFAULT mode
91+
for i in $(seq 0 $((GPU_COUNT-1))); do
92+
nvidia-smi -i $i -c DEFAULT || true
93+
done
94+
fi
95+
fi
7496
7597
7698
- name: Build GPU Test Environment

.gitlab/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ fi
117117
# Add DOCA repository and install packages
118118
ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi)
119119
MELLANOX_OS="$(. /etc/lsb-release; echo ${DISTRIB_ID}${DISTRIB_RELEASE} | tr A-Z a-z | tr -d .)"
120-
wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.1.0/host/doca-host_3.1.0-091000-25.07-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb
120+
wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.0/host/doca-host_3.2.0-125000-25.10-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb
121121
$SUDO dpkg -i doca-host.deb
122122
$SUDO apt-get update
123123
$SUDO apt-get upgrade -y

.gitlab/test_cpp.sh

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,40 @@ export PATH=${INSTALL_DIR}/bin:$PATH
4141
export PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig:$PKG_CONFIG_PATH
4242
export NIXL_PLUGIN_DIR=${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins
4343

44+
# Set UCX GDA max system latency to allow GDA on SYS topology
45+
# TODO: Remove this once CI setups have better GPU-NIC locality
46+
# export UCX_IB_GDA_MAX_SYS_LATENCY=1us
47+
4448
echo "==== Show system info ===="
4549
env
4650
nvidia-smi topo -m || true
4751
ibv_devinfo || true
4852
uname -a || true
4953
cat /sys/devices/virtual/dmi/id/product_name || true
5054

55+
echo "==== NVIDIA Peermem check ===="
56+
if ! lsmod | grep -q nvidia_peermem; then
57+
echo "nvidia_peermem module not loaded"
58+
fi
59+
60+
if [ -f /sys/kernel/mm/memory_peers/nv_mem/version ]; then
61+
cat /sys/kernel/mm/memory_peers/nv_mem/version
62+
else
63+
echo "/sys/kernel/mm/memory_peers/nv_mem/version not found "
64+
fi
65+
66+
if [ -f /sys/module/nvidia_peermem/version ]; then
67+
cat /sys/module/nvidia_peermem/version
68+
else
69+
echo "/sys/module/nvidia_peermem/version not found"
70+
fi
71+
72+
if [ -f /sys/module/nv_peer_mem/version ]; then
73+
cat /sys/module/nv_peer_mem/version
74+
else
75+
echo "/sys/module/nv_peer_mem/version not found"
76+
fi
77+
5178
echo "==== Running ETCD server ===="
5279
etcd_port=$(get_next_tcp_port)
5380
etcd_peer_port=$(get_next_tcp_port)
@@ -89,7 +116,8 @@ kill -s INT $telePID
89116
# fi
90117

91118
# shellcheck disable=SC2154
92-
gtest-parallel --workers=1 --serialize_test_cases ./bin/gtest -- --min-tcp-port="$min_gtest_port" --max-tcp-port="$max_gtest_port"
119+
# TODO: enable PrepGpuSignal and ucxDeviceApi tests once the problem in UCX is fixed
120+
gtest-parallel --workers=1 --serialize_test_cases ./bin/gtest -- --min-tcp-port="$min_gtest_port" --max-tcp-port="$max_gtest_port" --gtest_filter=-*PrepGpuSignal*:*ucxDeviceApi*
93121
./bin/test_plugin
94122

95123
# Run NIXL client-server test

benchmark/nixlbench/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,8 @@ make -j$(nproc) && sudo make install
300300
**DOCA (Optional):**
301301
```bash
302302
# Add Mellanox repository and install DOCA
303-
wget https://www.mellanox.com/downloads/DOCA/DOCA_v3.1.0/host/doca-host_3.1.0-091000-25.07-ubuntu2404_amd64.deb
304-
sudo dpkg -i doca-host_3.1.0-091000-25.07-ubuntu2404_amd64.deb
303+
wget https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.0/host/doca-host_3.2.0-125000-25.10-ubuntu2404_amd64.deb -O doca-host.deb
304+
sudo dpkg -i doca-host.deb
305305
sudo apt-get update && sudo apt-get install -y doca-sdk-gpunetio libdoca-sdk-gpunetio-dev
306306
```
307307

benchmark/nixlbench/contrib/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ RUN apt-get update -y && \
5555
# Add DOCA repository and install packages
5656
RUN ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) && \
5757
MELLANOX_OS="$(. /etc/lsb-release; echo ${DISTRIB_ID}${DISTRIB_RELEASE} | tr A-Z a-z | tr -d .)" && \
58-
wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.1.0/host/doca-host_3.1.0-091000-25.07-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb && \
58+
wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.0/host/doca-host_3.2.0-125000-25.10-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb && \
5959
dpkg -i doca-host.deb && \
6060
apt-get update && \
6161
apt-get upgrade -y && \

contrib/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ RUN apt-get update -y && \
6868
# Add DOCA repository and install packages
6969
RUN ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) && \
7070
MELLANOX_OS="$(. /etc/lsb-release; echo ${DISTRIB_ID}${DISTRIB_RELEASE} | tr A-Z a-z | tr -d .)" && \
71-
wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.1.0/host/doca-host_3.1.0-091000-25.07-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb && \
71+
wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.0/host/doca-host_3.2.0-125000-25.10-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb && \
7272
dpkg -i doca-host.deb && \
7373
apt-get update && \
7474
apt-get upgrade -y && \

contrib/Dockerfile.manylinux

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,10 @@ RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.
152152
rm rustup-init && \
153153
chmod -R a+w $RUSTUP_HOME $CARGO_HOME
154154

155-
RUN wget https://www.mellanox.com/downloads/DOCA/DOCA_v3.1.0/host/doca-host-3.1.0-091000_25.07_rhel89.${ARCH}.rpm && \
156-
rpm -i doca-host-3.1.0-091000_25.07_rhel89.${ARCH}.rpm && \
155+
RUN wget --tries=3 --waitretry=5 https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.0/host/doca-host-3.2.0-125000_25.10_rhel8.${ARCH}.rpm -O doca-host.rpm && \
156+
rpm -i doca-host.rpm && \
157157
dnf install -y libnl3-devel && \
158-
cd /usr/share/doca-host-3.1.0/repo/Packages/ && \
158+
cd /usr/share/doca-host-3.2.0/repo/Packages/ && \
159159
rpm -ivh --nodeps doca-sdk-common-*rpm && \
160160
rpm -ivh --nodeps doca-sdk-rdma-*rpm && \
161161
rpm -ivh --nodeps doca-sdk-verbs-*rpm && \

meson.build

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -127,25 +127,6 @@ else
127127
cuda_wheel_dir = 'nixl_cu12'
128128
endif
129129

130-
# DOCA GPUNETIO
131-
if cuda_dep.found()
132-
nvcc_cmd = find_program('nvcc', required: false)
133-
if nvcc_cmd.found()
134-
if nvcc_cmd.version().version_compare('>=12.8') and nvcc_cmd.version().version_compare('<13.0')
135-
doca_gpunetio_dep = dependency('doca-gpunetio', required : false)
136-
else
137-
warning('CUDA version = ' + nvcc_cmd.version() + ', GPUNETIO plugin will be disabled')
138-
doca_gpunetio_dep = disabler()
139-
endif
140-
else
141-
warning('nvcc not found, GPUNETIO plugin will be disabled')
142-
doca_gpunetio_dep = disabler()
143-
endif
144-
else
145-
warning('CUDA not found, GPUNETIO plugin will be disabled')
146-
doca_gpunetio_dep = disabler()
147-
endif
148-
149130
# Check for etcd-cpp-api - use multiple methods for discovery
150131
etcd_dep = dependency('etcd-cpp-api', required : false)
151132
etcd_inc_path = get_option('etcd_inc_path')
@@ -213,10 +194,11 @@ nvcc_prog = find_program('nvcc', required: false)
213194
ucx_gpu_device_api_available = false
214195
if ucx_dep.found() and cuda_dep.found() and nvcc_prog.found()
215196
cuda = meson.get_compiler('cuda')
197+
# TODO: Expose doca_gpunetio_dep through UCX
216198
have_gpu_side = cuda.compiles('''
217199
#include <ucp/api/device/ucp_device_impl.h>
218200
int main() { return 0; }
219-
''', dependencies : ucx_dep, args: nvcc_flags)
201+
''', dependencies : [ucx_dep, doca_gpunetio_dep], args: nvcc_flags)
220202

221203
have_host_side = cpp.compiles('''
222204
#include <ucp/api/device/ucp_host.h>
@@ -233,6 +215,7 @@ if ucx_dep.found() and cuda_dep.found() and nvcc_prog.found()
233215
'GPU-side compile' : have_gpu_side,
234216
'Host-side compile' : have_host_side,
235217
'nvcc available' : nvcc_prog.found(),
218+
'DOCA GPUNETIO found': doca_gpunetio_dep.found(),
236219
}, section: 'UCX GPU Device API', bool_yn: true)
237220
endif
238221

0 commit comments

Comments
 (0)