From f1aefa401795711af10eddda86c7cea6b02621d0 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Tue, 4 Nov 2025 17:56:41 +0200 Subject: [PATCH 1/9] BUILD: Fix UCX GPU device API detection Signed-off-by: Michal Shalev --- meson.build | 5 +++-- test/gtest/meson.build | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 57b87f4c3..6dc164ec4 100644 --- a/meson.build +++ b/meson.build @@ -128,7 +128,7 @@ endif if cuda_dep.found() nvcc_cmd = find_program('nvcc', required: false) if nvcc_cmd.found() - if nvcc_cmd.version().version_compare('>=12.8') and nvcc_cmd.version().version_compare('<13.0') + if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else warning('CUDA version = ' + nvcc_cmd.version() + ', GPUNETIO plugin will be disabled') @@ -213,7 +213,7 @@ if ucx_dep.found() and cuda_dep.found() and nvcc_prog.found() have_gpu_side = cuda.compiles(''' #include int main() { return 0; } - ''', dependencies : ucx_dep, args: nvcc_flags) + ''', dependencies : [ucx_dep, doca_gpunetio_dep], args: nvcc_flags) have_host_side = cpp.compiles(''' #include @@ -230,6 +230,7 @@ if ucx_dep.found() and cuda_dep.found() and nvcc_prog.found() 'GPU-side compile' : have_gpu_side, 'Host-side compile' : have_host_side, 'nvcc available' : nvcc_prog.found(), + 'DOCA GPUNETIO found': doca_gpunetio_dep.found(), }, section: 'UCX GPU Device API', bool_yn: true) endif diff --git a/test/gtest/meson.build b/test/gtest/meson.build index 71dd2a0e1..80bd0010c 100644 --- a/test/gtest/meson.build +++ b/test/gtest/meson.build @@ -72,7 +72,7 @@ gtest_sources = [ if ucx_gpu_device_api_available gtest_sources += device_api_test_sources device_api_inc = [nixl_gpu_inc_dirs, include_directories('device_api')] - device_api_dep = ucx_dep + device_api_dep = [ucx_dep, doca_gpunetio_dep] else device_api_inc = [] device_api_dep = [] From 170056907c495553f8e8c3dc5ada74e2c90e71c4 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Wed, 5 Nov 2025 01:29:48 +0200 Subject: [PATCH 2/9] Skip PrepGpuSignal test for UCX_MO Signed-off-by: Michal Shalev --- test/gtest/test_transfer.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/gtest/test_transfer.cpp b/test/gtest/test_transfer.cpp index 73abbdd42..396752820 100644 --- a/test/gtest/test_transfer.cpp +++ b/test/gtest/test_transfer.cpp @@ -702,6 +702,11 @@ TEST_P(TestTransfer, PrepGpuSignal) { #ifndef HAVE_UCX_GPU_DEVICE_API GTEST_SKIP() << "UCX GPU device API not available, skipping test"; #else + // UCX_MO backend does not support GPU signals + if (getBackendName() == "UCX_MO") { + GTEST_SKIP() << "UCX_MO backend does not support GPU signals"; + } + size_t gpu_signal_size = 0; nixl_opt_args_t extra_params = {.backends = {backend_handles[0]}}; nixl_status_t size_status = getAgent(0).getGpuSignalSize(gpu_signal_size, &extra_params); From 0307756cbd96a20c29c8a4f3fced133908135c6a Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Sun, 9 Nov 2025 17:26:53 +0200 Subject: [PATCH 3/9] PR fixes --- meson.build | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 6dc164ec4..85eb71fac 100644 --- a/meson.build +++ b/meson.build @@ -126,12 +126,11 @@ endif # DOCA GPUNETIO if cuda_dep.found() - nvcc_cmd = find_program('nvcc', required: false) - if nvcc_cmd.found() + if nvcc.found() if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else - warning('CUDA version = ' + nvcc_cmd.version() + ', GPUNETIO plugin will be disabled') + warning('CUDA version = ' + nvcc.version() + ', GPUNETIO plugin will be disabled') doca_gpunetio_dep = disabler() endif else @@ -210,6 +209,7 @@ nvcc_prog = find_program('nvcc', required: false) ucx_gpu_device_api_available = false if ucx_dep.found() and cuda_dep.found() and nvcc_prog.found() cuda = meson.get_compiler('cuda') + # TODO: Expose doca_gpunetio_dep through UCX have_gpu_side = cuda.compiles(''' #include int main() { return 0; } From 9c14b3fcd1e78fb86bfbfc7d08791e3c2aef07b2 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Sun, 9 Nov 2025 17:30:41 +0200 Subject: [PATCH 4/9] PR fixes 2.0 --- test/gtest/test_transfer.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/gtest/test_transfer.cpp b/test/gtest/test_transfer.cpp index d527298e6..c9a3668c0 100644 --- a/test/gtest/test_transfer.cpp +++ b/test/gtest/test_transfer.cpp @@ -697,11 +697,6 @@ TEST_P(TestTransfer, PrepGpuSignal) { #ifndef HAVE_UCX_GPU_DEVICE_API GTEST_SKIP() << "UCX GPU device API not available, skipping test"; #else - // UCX_MO backend does not support GPU signals - if (getBackendName() == "UCX_MO") { - GTEST_SKIP() << "UCX_MO backend does not support GPU signals"; - } - size_t gpu_signal_size = 0; nixl_opt_args_t extra_params = {.backends = {backend_handles[0]}}; nixl_status_t size_status = getAgent(0).getGpuSignalSize(gpu_signal_size, &extra_params); From 0dc38fce4c0c6edb91e6434555cf88c7aeff05b3 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Sun, 9 Nov 2025 18:00:59 +0200 Subject: [PATCH 5/9] PR fixes 3.0 --- meson.build | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 30040d956..62bf48df8 100644 --- a/meson.build +++ b/meson.build @@ -129,7 +129,8 @@ endif # DOCA GPUNETIO if cuda_dep.found() - if nvcc.found() + nvcc_cmd = find_program('nvcc', required: false) + if nvcc_cmd.found() if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else From 1e3728050bb732fdf6742655983c68f798bd34e6 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Wed, 12 Nov 2025 15:12:26 +0200 Subject: [PATCH 6/9] PR fixes Signed-off-by: Michal Shalev --- meson.build | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/meson.build b/meson.build index 62bf48df8..2cffcfdda 100644 --- a/meson.build +++ b/meson.build @@ -120,30 +120,27 @@ if cuda_dep.found() else error('Unsupported CUDA version: ' + cuda_version_major) endif + + # DOCA GPUNETIO + nvcc_cmd = find_program('nvcc', required: false) + if nvcc_cmd.found() + if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') + doca_gpunetio_dep = dependency('doca-gpunetio', required : false) + else + warning('CUDA version = ' + nvcc.version() + ', GPUNETIO plugin will be disabled') + doca_gpunetio_dep = disabler() + endif + else + warning('nvcc not found, GPUNETIO plugin will be disabled') + doca_gpunetio_dep = disabler() + endif else warning('CUDA not found. UCX backend will be built without CUDA support, and some plugins will be disabled.') doca_gpunetio_dep = disabler() warning('CUDA not found, cannot autodetect wheel dir; defaulting to nixl_cu12') cuda_wheel_dir = 'nixl_cu12' -endif - -# DOCA GPUNETIO -if cuda_dep.found() - nvcc_cmd = find_program('nvcc', required: false) - if nvcc_cmd.found() - if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') - doca_gpunetio_dep = dependency('doca-gpunetio', required : false) - else - warning('CUDA version = ' + nvcc.version() + ', GPUNETIO plugin will be disabled') - doca_gpunetio_dep = disabler() - endif - else - warning('nvcc not found, GPUNETIO plugin will be disabled') + warning('CUDA not found, GPUNETIO plugin will be disabled') doca_gpunetio_dep = disabler() - endif -else - warning('CUDA not found, GPUNETIO plugin will be disabled') - doca_gpunetio_dep = disabler() endif # Check for etcd-cpp-api - use multiple methods for discovery From e190f3c32d5432715b44774507bef0ddf6f89e39 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Wed, 12 Nov 2025 18:13:21 +0200 Subject: [PATCH 7/9] PR fixes Signed-off-by: Michal Shalev --- meson.build | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/meson.build b/meson.build index f9e48af64..f4cc9ef5a 100644 --- a/meson.build +++ b/meson.build @@ -122,16 +122,10 @@ if cuda_dep.found() endif # DOCA GPUNETIO - nvcc_cmd = find_program('nvcc', required: false) - if nvcc_cmd.found() - if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') - doca_gpunetio_dep = dependency('doca-gpunetio', required : false) - else - warning('CUDA version = ' + nvcc.version() + ', GPUNETIO plugin will be disabled') - doca_gpunetio_dep = disabler() - endif + if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') + doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else - warning('nvcc not found, GPUNETIO plugin will be disabled') + warning('CUDA version = ' + nvcc.version() + ', GPUNETIO plugin will be disabled') doca_gpunetio_dep = disabler() endif else From c6b62ab4925ff60a7df403cc35cf3ff1565d29f0 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Wed, 12 Nov 2025 20:11:34 +0200 Subject: [PATCH 8/9] PR fixes Signed-off-by: Michal Shalev --- meson.build | 8 -------- 1 file changed, 8 deletions(-) diff --git a/meson.build b/meson.build index f4cc9ef5a..6f54e71ba 100644 --- a/meson.build +++ b/meson.build @@ -120,14 +120,6 @@ if cuda_dep.found() else error('Unsupported CUDA version: ' + cuda_version_major) endif - - # DOCA GPUNETIO - if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') - doca_gpunetio_dep = dependency('doca-gpunetio', required : false) - else - warning('CUDA version = ' + nvcc.version() + ', GPUNETIO plugin will be disabled') - doca_gpunetio_dep = disabler() - endif else warning('CUDA not found. UCX backend will be built without CUDA support, and some plugins will be disabled.') doca_gpunetio_dep = disabler() From 5e118a6b06e98ed799ebf2998021840b0d2cfeb4 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Sun, 16 Nov 2025 17:10:06 +0200 Subject: [PATCH 9/9] Set UCX_IB_GDA_MAX_SYS_LATENCY Signed-off-by: Michal Shalev --- .gitlab/test_cpp.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitlab/test_cpp.sh b/.gitlab/test_cpp.sh index b5984edae..625032284 100755 --- a/.gitlab/test_cpp.sh +++ b/.gitlab/test_cpp.sh @@ -41,6 +41,10 @@ export PATH=${INSTALL_DIR}/bin:$PATH export PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig:$PKG_CONFIG_PATH export NIXL_PLUGIN_DIR=${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins +# Set UCX GDA max system latency to allow GDA on SYS topology +# TODO: Remove this once CI setups have better GPU-NIC locality +export UCX_IB_GDA_MAX_SYS_LATENCY=1us + echo "==== Show system info ====" env nvidia-smi topo -m || true