From 61acfa97652086f053cfd9124a9c2ef0a137a945 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Wed, 5 Nov 2025 13:12:32 +0200 Subject: [PATCH 01/15] CI: Update cuda ver for GPU tests to 13 Signed-off-by: Alexey Rivkin --- .ci/dockerfiles/Dockerfile.gpu_test | 4 ++-- .ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md | 2 +- .ci/jenkins/lib/build-matrix.yaml | 6 +++--- .ci/jenkins/lib/test-matrix.yaml | 2 +- .ci/jenkins/pipeline/proj-jjb.yaml | 4 ++-- benchmark/nixlbench/README.md | 2 +- benchmark/nixlbench/contrib/Dockerfile | 2 +- benchmark/nixlbench/contrib/build.sh | 2 +- contrib/Dockerfile | 2 +- contrib/aws-efa/README.md | 2 +- contrib/aws-efa/aws_job_def.json | 2 +- contrib/aws-efa/aws_test.sh | 4 ++-- contrib/build-container.sh | 5 +++-- 13 files changed, 20 insertions(+), 19 deletions(-) diff --git a/.ci/dockerfiles/Dockerfile.gpu_test b/.ci/dockerfiles/Dockerfile.gpu_test index c441bea6b..36fa826f6 100644 --- a/.ci/dockerfiles/Dockerfile.gpu_test +++ b/.ci/dockerfiles/Dockerfile.gpu_test @@ -13,7 +13,7 @@ # docker run --gpus all --privileged -it nixl-gpu-test # # Build arguments: -# BASE_IMAGE: Base NVIDIA cuda-dl-base image (default: nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04) +# BASE_IMAGE: Base NVIDIA cuda-dl-base image (default: nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04) # _UID: User ID for the non-root user (default: 148069) # _GID: Group ID for the user (default: 30) # _LOGIN: Username (default: svc-nixl) @@ -22,7 +22,7 @@ # WORKSPACE: Workspace directory path # -ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 +ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 FROM ${BASE_IMAGE} diff --git a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md index c92469584..f3b59dc8a 100644 --- a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md +++ 
b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md @@ -137,7 +137,7 @@ sudo nvidia-ctk runtime configure --runtime=docker sudo systemctl restart docker ``` -Verify GPU access in containers using `docker run --gpus all nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 nvidia-smi`[^1_3]. +Verify GPU access in containers using `docker run --gpus all nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 nvidia-smi`[^1_3]. ### 9. **Validation and Troubleshooting** diff --git a/.ci/jenkins/lib/build-matrix.yaml b/.ci/jenkins/lib/build-matrix.yaml index 1ea9b3637..f52e65d71 100644 --- a/.ci/jenkins/lib/build-matrix.yaml +++ b/.ci/jenkins/lib/build-matrix.yaml @@ -6,7 +6,7 @@ # Key Components: # - Job Configuration: Defines timeout, failure behavior, and Kubernetes resources # - Docker Images: Specifies the container images used for different build stages -# - cuda-dl-base images (25.06 for Ubuntu 24.04, 24.10 for Ubuntu 22.04) for building and testing +# - cuda-dl-base images (25.10 for Ubuntu 24.04, 13.0.1 for Ubuntu 22.04) for building and testing # - Podman image for container builds # - Matrix Axes: Defines build variations (currently x86_64 architecture) # - Build Steps: Sequential steps for building, testing, and container creation @@ -34,8 +34,8 @@ kubernetes: requests: "{memory: 8Gi, cpu: 8000m}" runs_on_dockers: - - { name: "ubuntu24.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04" } - - { name: "ubuntu22.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:24.10-cuda12.6-devel-ubuntu22.04" } + - { name: "ubuntu24.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04" } + - { name: "ubuntu22.04-cuda-dl-base", url: "nvidia/cuda:13.0.1-devel-ubuntu22.04" } - { name: "podman-v5.0.2", url: "quay.io/podman/stable:v5.0.2", category: 'tool', privileged: true } matrix: diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 34836cb2f..face1fc5c 
100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -30,7 +30,7 @@ runs_on_agents: matrix: axes: image: - - nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 + - nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 arch: - x86_64 ucx_version: diff --git a/.ci/jenkins/pipeline/proj-jjb.yaml b/.ci/jenkins/pipeline/proj-jjb.yaml index 69aa2f00d..7cc7436a1 100644 --- a/.ci/jenkins/pipeline/proj-jjb.yaml +++ b/.ci/jenkins/pipeline/proj-jjb.yaml @@ -280,7 +280,7 @@ description: "Base Docker image for the container build" - string: name: "BASE_IMAGE_TAG" - default: "25.06-cuda12.9-devel-ubuntu24.04" + default: "25.10-cuda13.0-devel-ubuntu24.04" description: "Tag for the base Docker image" - string: name: "TAG_SUFFIX" @@ -294,7 +294,7 @@ description: > Update the latest tag for this architecture.
When enabled, also creates: <base-image-tag>-<arch>-latest
- Example: 25.06-cuda12.9-devel-ubuntu24.04-aarch64-latest
+ Example: 25.10-cuda13.0-devel-ubuntu24.04-aarch64-latest
- string: name: "MAIL_TO" default: "25f58ae0.NVIDIA.onmicrosoft.com@amer.teams.ms" diff --git a/benchmark/nixlbench/README.md b/benchmark/nixlbench/README.md index fb37927c0..3bc6371f7 100644 --- a/benchmark/nixlbench/README.md +++ b/benchmark/nixlbench/README.md @@ -172,7 +172,7 @@ cd nixl/benchmark/nixlbench/contrib | `--ucx ` | Path to custom UCX source (optional) | Uses base image UCX | | `--build-type ` | Build type: `debug` or `release` | `release` | | `--base-image ` | Base Docker image | `nvcr.io/nvidia/cuda-dl-base` | -| `--base-image-tag ` | Base image tag | `25.06-cuda12.9-devel-ubuntu24.04` | +| `--base-image-tag ` | Base image tag | `25.10-cuda13.0-devel-ubuntu24.04` | | `--arch ` | Target architecture: `x86_64` or `aarch64` | Auto-detected | | `--python-versions ` | Python versions (comma-separated) | `3.12` | | `--tag ` | Custom Docker image tag | Auto-generated | diff --git a/benchmark/nixlbench/contrib/Dockerfile b/benchmark/nixlbench/contrib/Dockerfile index 2c6ee0e01..fb50f6873 100644 --- a/benchmark/nixlbench/contrib/Dockerfile +++ b/benchmark/nixlbench/contrib/Dockerfile @@ -14,7 +14,7 @@ # limitations under the License. 
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04" # UCX argument is either "upstream" (default installed in base image) or "custom" (build from source) ARG UCX="upstream" diff --git a/benchmark/nixlbench/contrib/build.sh b/benchmark/nixlbench/contrib/build.sh index 2571d366f..dbd2eea33 100755 --- a/benchmark/nixlbench/contrib/build.sh +++ b/benchmark/nixlbench/contrib/build.sh @@ -35,7 +35,7 @@ if [ -z ${latest_tag} ]; then fi BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base -BASE_IMAGE_TAG=25.06-cuda12.9-devel-ubuntu24.04 +BASE_IMAGE_TAG=25.10-cuda13.0-devel-ubuntu24.04 ARCH=$(uname -m) [ "$ARCH" = "arm64" ] && ARCH="aarch64" WHL_BASE=manylinux_2_39 diff --git a/contrib/Dockerfile b/contrib/Dockerfile index 07619cce9..b762dfdaa 100644 --- a/contrib/Dockerfile +++ b/contrib/Dockerfile @@ -14,7 +14,7 @@ # limitations under the License. ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04" ARG OS FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} diff --git a/contrib/aws-efa/README.md b/contrib/aws-efa/README.md index cd74a7e06..58e171cf5 100644 --- a/contrib/aws-efa/README.md +++ b/contrib/aws-efa/README.md @@ -89,7 +89,7 @@ The AWS test script: ## Container Image -The script uses the container image: `nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04` +The script uses the container image: `nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04` You can override this by setting the `CONTAINER_IMAGE` environment variable: ```bash diff --git a/contrib/aws-efa/aws_job_def.json b/contrib/aws-efa/aws_job_def.json index 6477a16aa..5a539fb91 100644 --- a/contrib/aws-efa/aws_job_def.json +++ b/contrib/aws-efa/aws_job_def.json @@ -15,7 +15,7 @@ "imagePullSecrets": [], "containers": [ { - "image": "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04", + "image": 
"nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04", "command": [ "/bin/bash", "-c", diff --git a/contrib/aws-efa/aws_test.sh b/contrib/aws-efa/aws_test.sh index 0477757a7..44694e08f 100755 --- a/contrib/aws-efa/aws_test.sh +++ b/contrib/aws-efa/aws_test.sh @@ -30,7 +30,7 @@ usage() { echo " GITHUB_REPOSITORY - GitHub repository (e.g., \"ai-dynamo/nixl\")" echo "" echo "Optional environment variables:" - echo " CONTAINER_IMAGE - Container image to use (default: nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04)" + echo " CONTAINER_IMAGE - Container image to use (default: nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04)" echo " TEST_TIMEOUT - Timeout for test execution in minutes" exit 1 } @@ -47,7 +47,7 @@ if [ -z "$GITHUB_REF" ] || [ -z "$GITHUB_SERVER_URL" ] || [ -z "$GITHUB_REPOSITO fi test_cmd="$1" -export CONTAINER_IMAGE=${CONTAINER_IMAGE:-"nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04"} +export CONTAINER_IMAGE=${CONTAINER_IMAGE:-"nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04"} # Set Git checkout command based on GITHUB_REF case "$GITHUB_REF" in diff --git a/contrib/build-container.sh b/contrib/build-container.sh index 694ecfd6e..5e40002fd 100755 --- a/contrib/build-container.sh +++ b/contrib/build-container.sh @@ -29,7 +29,7 @@ fi VERSION=v$latest_tag.dev.$commit_id BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base -BASE_IMAGE_TAG=25.06-cuda12.9-devel-ubuntu24.04 +BASE_IMAGE_TAG=25.10-cuda13.0-devel-ubuntu24.04 ARCH=$(uname -m) [ "$ARCH" = "arm64" ] && ARCH="aarch64" WHL_BASE=manylinux_2_39 @@ -135,7 +135,8 @@ get_options() { done if [[ $OS == "ubuntu22" ]]; then - BASE_IMAGE_TAG=24.10-cuda12.6-devel-ubuntu22.04 + BASE_IMAGE=nvidia/cuda + BASE_IMAGE_TAG=13.0.1-devel-ubuntu22.04 WHL_BASE=${WHL_BASE:-manylinux_2_34} fi From 25da816eec031578bb9137afecf3affd93561657 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Thu, 6 Nov 2025 22:03:13 +0200 Subject: [PATCH 02/15] Update CUDA ver validation in tests and 
readme Signed-off-by: Alexey Rivkin --- .gitlab/test_rust.sh | 2 +- benchmark/nixlbench/README.md | 10 +++++----- meson.build | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.gitlab/test_rust.sh b/.gitlab/test_rust.sh index dd8f38682..caafbf999 100755 --- a/.gitlab/test_rust.sh +++ b/.gitlab/test_rust.sh @@ -36,7 +36,7 @@ which cargo cargo --version export LD_LIBRARY_PATH=${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins:/usr/local/lib:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64:/usr/local/cuda-12.8/compat:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64:/usr/local/cuda-13.0/compat:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/compat/lib.real:$LD_LIBRARY_PATH export CPATH=${INSTALL_DIR}/include:$CPATH export PATH=${INSTALL_DIR}/bin:$PATH diff --git a/benchmark/nixlbench/README.md b/benchmark/nixlbench/README.md index 3bc6371f7..af6657de8 100644 --- a/benchmark/nixlbench/README.md +++ b/benchmark/nixlbench/README.md @@ -65,7 +65,7 @@ A comprehensive benchmarking tool for the NVIDIA Inference Xfer Library (NIXL) t - **Operating System**: Ubuntu 22.04/24.04 LTS (recommended) or RHEL-based - **Docker**: Version 20.10+ (for container builds) - **Git**: For source code management -- **CUDA Toolkit**: 12.8+ (for GPU features) +- **CUDA Toolkit**: 13.0+ (for GPU features) - **Python**: 3.12+ (for benchmark utilities) ## Quick Start @@ -187,7 +187,7 @@ For development environments or when Docker is not available. 
**Required:** - **NIXL**: Core communication library - **UCX**: Unified Communication X library -- **CUDA**: NVIDIA CUDA Toolkit (≥12.8) +- **CUDA**: NVIDIA CUDA Toolkit (≥13.0) - **CMake**: Build system (≥3.20) - **Meson**: Build system for NIXL/NIXLBench - **Ninja**: Build backend @@ -234,9 +234,9 @@ sudo apt-get reinstall -y --no-install-recommends \ #### CUDA Toolkit Installation ```bash -# Download and install CUDA 12.8 -wget https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_550.54.15_linux.run -sudo sh cuda_12.8.0_550.54.15_linux.run +# Download and install CUDA 13.0 +wget https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run +sudo sh cuda_13.0.2_580.95.05_linux.run # Set environment variables export PATH=/usr/local/cuda/bin:$PATH diff --git a/meson.build b/meson.build index 60b5a5699..f53b2776d 100644 --- a/meson.build +++ b/meson.build @@ -104,7 +104,7 @@ if cuda_dep.found() nvcc_flags_link += ['-gencode=arch=compute_90,code=sm_90'] add_project_link_arguments(nvcc_flags_link, language: 'cuda') message('nvcc version: ' + nvcc.version()) - if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') + if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<14.0') doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else warning('GPUNETIO plugin not supported in CUDA version: ' + nvcc.version()) @@ -131,7 +131,7 @@ endif if cuda_dep.found() nvcc_cmd = find_program('nvcc', required: false) if nvcc_cmd.found() - if nvcc_cmd.version().version_compare('>=12.8') and nvcc_cmd.version().version_compare('<13.0') + if nvcc_cmd.version().version_compare('>=12.8') and nvcc_cmd.version().version_compare('<14.0') doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else warning('CUDA version = ' + nvcc_cmd.version() + ', GPUNETIO plugin will be disabled') From 993b9af3862ffbd5458d25e3d5037c14463bb962 Mon Sep 
17 00:00:00 2001 From: Alexey Rivkin Date: Sun, 9 Nov 2025 13:35:03 +0200 Subject: [PATCH 03/15] Switch default Python package dependency to nixl-cu13 Signed-off-by: Alexey Rivkin --- contrib/Dockerfile.manylinux | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index eb66ee886..97d43c130 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -281,8 +281,8 @@ RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ done # Copy the meta package wheel to the dist directory, which will be used to push to PyPI. -# Only do this for the CUDA 12 builds, since by default nixl depends on nixl-cu12. -RUN if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \ +# Only do this for the CUDA 13 builds, since by default nixl depends on nixl-cu13. +RUN if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "13" ]; then \ cp build/src/bindings/python/nixl-meta/nixl*.whl dist/; \ fi From e0160d3677da1c4d4c1b55649a6bdafa16009791 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Sun, 9 Nov 2025 14:49:43 +0200 Subject: [PATCH 04/15] Fix POSIX shell compatibility in common.sh Replace bash [[ ]] with POSIX [ ] in container detection. Scripts using #!/bin/sh failed on [[ syntax, causing NPROC to default to 256 CPUs instead of memory-based limit, leading to OOM. 
Signed-off-by: Alexey Rivkin --- .ci/scripts/common.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/scripts/common.sh b/.ci/scripts/common.sh index 9406fb8ea..e1219138b 100755 --- a/.ci/scripts/common.sh +++ b/.ci/scripts/common.sh @@ -98,7 +98,7 @@ export TEST_LIBFABRIC=${TEST_LIBFABRIC:-false} # Set default parallelism for make/ninja (can be overridden by NPROC env var) if [ -z "$NPROC" ]; then # In containers, calculate based on memory limits to avoid OOM - if [[ -f /.dockerenv || -f /run/.containerenv || -n "${KUBERNETES_SERVICE_HOST}" ]]; then + if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes) elif [ -f /sys/fs/cgroup/memory.max ]; then From cbf9de4055d8884aca86f804ddb2b5bf27a2c662 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Mon, 10 Nov 2025 01:01:03 +0200 Subject: [PATCH 05/15] Update cuda ver in toml to align with Dockerfile.manylinux Signed-off-by: Alexey Rivkin --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1fc21b772..2cb2c817d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ requires = ["meson-python", "pybind11", "patchelf", "pyyaml", "types-PyYAML", "p build-backend = "mesonpy" [project] -name = 'nixl-cu12' +name = 'nixl-cu13' version = '0.7.1' description = 'NIXL Python API' readme = 'README.md' From e49f13bde85fcd212345c075acaf15e0a750045c Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Mon, 10 Nov 2025 11:56:42 +0200 Subject: [PATCH 06/15] Revert PyPi defaults back to cuda 12 Signed-off-by: Alexey Rivkin --- contrib/Dockerfile.manylinux | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index 97d43c130..eb66ee886 100644 --- a/contrib/Dockerfile.manylinux +++ 
b/contrib/Dockerfile.manylinux @@ -281,8 +281,8 @@ RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ done # Copy the meta package wheel to the dist directory, which will be used to push to PyPI. -# Only do this for the CUDA 13 builds, since by default nixl depends on nixl-cu13. -RUN if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "13" ]; then \ +# Only do this for the CUDA 12 builds, since by default nixl depends on nixl-cu12. +RUN if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \ cp build/src/bindings/python/nixl-meta/nixl*.whl dist/; \ fi diff --git a/pyproject.toml b/pyproject.toml index 2cb2c817d..1fc21b772 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ requires = ["meson-python", "pybind11", "patchelf", "pyyaml", "types-PyYAML", "p build-backend = "mesonpy" [project] -name = 'nixl-cu13' +name = 'nixl-cu12' version = '0.7.1' description = 'NIXL Python API' readme = 'README.md' From be23419422fbe82d341025c2b08818daea3a8502 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Tue, 11 Nov 2025 12:03:16 +0200 Subject: [PATCH 07/15] Revert the GPUNETIO package default GPUNETIO plugin does not work with CUDA 13.0 at the moment, because DOCA 3.1 still links against CUDA 12 libraries. 
Signed-off-by: Alexey Rivkin --- meson.build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index f53b2776d..60b5a5699 100644 --- a/meson.build +++ b/meson.build @@ -104,7 +104,7 @@ if cuda_dep.found() nvcc_flags_link += ['-gencode=arch=compute_90,code=sm_90'] add_project_link_arguments(nvcc_flags_link, language: 'cuda') message('nvcc version: ' + nvcc.version()) - if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<14.0') + if nvcc.version().version_compare('>=12.8') and nvcc.version().version_compare('<13.0') doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else warning('GPUNETIO plugin not supported in CUDA version: ' + nvcc.version()) @@ -131,7 +131,7 @@ endif if cuda_dep.found() nvcc_cmd = find_program('nvcc', required: false) if nvcc_cmd.found() - if nvcc_cmd.version().version_compare('>=12.8') and nvcc_cmd.version().version_compare('<14.0') + if nvcc_cmd.version().version_compare('>=12.8') and nvcc_cmd.version().version_compare('<13.0') doca_gpunetio_dep = dependency('doca-gpunetio', required : false) else warning('CUDA version = ' + nvcc_cmd.version() + ', GPUNETIO plugin will be disabled') From 7d43f22da961c3f2aa077ada46c8260eaaf9ca42 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Tue, 11 Nov 2025 12:38:07 +0200 Subject: [PATCH 08/15] Pass NPROC var to Docker commands Signed-off-by: Alexey Rivkin --- .ci/jenkins/lib/test-matrix.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index face1fc5c..db6dc270a 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -98,7 +98,7 @@ steps: parallel: false run: | set -ex - docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c "UCX_VERSION=${ucx_version} .gitlab/build.sh ${INSTALL_DIR}" + docker exec -e NPROC -w ${CONTAINER_WORKSPACE} 
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c "UCX_VERSION=${ucx_version} .gitlab/build.sh ${INSTALL_DIR}" onfail: | docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" @@ -107,7 +107,7 @@ steps: - name: Test CPP parallel: false run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}" + timeout ${TEST_TIMEOUT}m docker exec -e NPROC -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}" onfail: | docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" @@ -115,7 +115,7 @@ steps: - name: Test Python parallel: false run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}" + timeout ${TEST_TIMEOUT}m docker exec -e NPROC -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}" onfail: | docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" @@ -123,7 +123,7 @@ steps: - name: Test Nixlbench parallel: false run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_nixlbench.sh ${INSTALL_DIR}" + timeout ${TEST_TIMEOUT}m docker exec -e NPROC -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_nixlbench.sh ${INSTALL_DIR}" onfail: | docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" @@ -131,7 +131,7 @@ steps: - name: Test Rust parallel: false run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c 
".gitlab/test_rust.sh ${INSTALL_DIR}" + timeout ${TEST_TIMEOUT}m docker exec -e NPROC -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_rust.sh ${INSTALL_DIR}" always: | docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" From 30433bee23f9a52d551a19f1952cefb5b38b2d38 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Tue, 11 Nov 2025 13:33:44 +0200 Subject: [PATCH 09/15] Add both CUDA 12 and 13 to the GPU test GPUNetIO is only tested on CUDA12, so adding both allows testing for one with GPUNETIO and one without. Signed-off-by: Alexey Rivkin --- .ci/jenkins/lib/test-matrix.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index db6dc270a..915cfb99c 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -31,6 +31,7 @@ matrix: axes: image: - nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 + - nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 arch: - x86_64 ucx_version: From 419fc7710f80f4d81bff1447f8bb7489c31a5d41 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Tue, 11 Nov 2025 15:09:08 +0200 Subject: [PATCH 10/15] Improve visibility by showing cuda ver in job name Signed-off-by: Alexey Rivkin --- .ci/jenkins/lib/test-matrix.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 915cfb99c..3256fa24e 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -29,16 +29,21 @@ runs_on_agents: matrix: axes: - image: - - nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 - - nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 + cuda_ver: + - cuda12 + - cuda13 arch: - x86_64 ucx_version: - master - v1.19.0 + include: + - cuda_ver: cuda12 + image:
nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 + - cuda_ver: cuda13 + image: nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 -taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}" +taskName: "${name}/${arch}/${cuda_ver}/ucx-${ucx_version}/${axis_index}" env: CONTAINER_WORKSPACE: /workspace From bedfa8be23a04060ebf26fb11f004d43664b5294 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Tue, 11 Nov 2025 15:50:23 +0200 Subject: [PATCH 11/15] Revert "Improve visibility by showing cuda ver in job name" This reverts commit 419fc7710f80f4d81bff1447f8bb7489c31a5d41. --- .ci/jenkins/lib/test-matrix.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index f288eec19..dc27ad2c8 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -29,9 +29,9 @@ runs_on_agents: matrix: axes: - cuda_ver: - - cuda12 - - cuda13 + image: + - nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 + - nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 arch: - x86_64 ucx_version: @@ -43,7 +43,7 @@ matrix: - cuda_ver: cuda13 image: nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 -taskName: "${name}/${arch}/${cuda_ver}/ucx-${ucx_version}/${axis_index}" +taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}" env: CONTAINER_WORKSPACE: /workspace From 32da639b4dbae770eaa959cf84e4adeed0fc6f65 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Tue, 11 Nov 2025 16:14:59 +0200 Subject: [PATCH 12/15] Revert "Improve visibility by showing cuda ver in job name" Signed-off-by: Alexey Rivkin --- .ci/jenkins/lib/test-matrix.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index dc27ad2c8..06c15aee9 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -37,11 +37,6 @@ matrix: ucx_version: - master - v1.20.x - 
include: - - cuda_ver: cuda12 - image: nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 - - cuda_ver: cuda13 - image: nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}" From 8d3ee0b04f2f7bc9d1c7d7b54f7d5edb73a70eb9 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Wed, 12 Nov 2025 10:31:50 +0200 Subject: [PATCH 13/15] Update libfabric ver to 1.21 This should resolve provider hangs during AWS tests Signed-off-by: Alexey Rivkin --- src/utils/libfabric/libfabric_common.cpp | 2 +- src/utils/libfabric/libfabric_rail.cpp | 4 ++-- src/utils/libfabric/libfabric_topology.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utils/libfabric/libfabric_common.cpp b/src/utils/libfabric/libfabric_common.cpp index 140694e11..ec945cedb 100644 --- a/src/utils/libfabric/libfabric_common.cpp +++ b/src/utils/libfabric/libfabric_common.cpp @@ -50,7 +50,7 @@ getAvailableNetworkDevices() { hints->mode = FI_CONTEXT; hints->ep_attr->type = FI_EP_RDM; - int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); + int ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed " << fi_strerror(-ret); fi_freeinfo(hints); diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index f5b155c7b..6f0fa6fd4 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -431,7 +431,7 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device, hints->domain_attr->threading = FI_THREAD_SAFE; try { // Get fabric info for this specific device - first try with FI_HMEM - int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); + int ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); // If no provider found with FI_HMEM, retry without it if (ret || !info) { @@ -442,7 +442,7 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string 
&device, hints->caps = FI_MSG | FI_RMA; hints->caps |= FI_LOCAL_COMM | FI_REMOTE_COMM; - ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); + ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for rail " << rail_id << ": " << fi_strerror(-ret); throw std::runtime_error("fi_getinfo failed for rail " + std::to_string(rail_id)); diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index ff428bc96..bac8c25bb 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -381,7 +381,7 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { // This ensures consistency between device discovery and PCIe mapping hints->fabric_attr->prov_name = strdup(provider_name.c_str()); - int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); + int ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for PCIe mapping with provider " << provider_name << ": " << fi_strerror(-ret); From 1c3324b8c5bc33d4d2565d3131e94ccf55987dbf Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Thu, 13 Nov 2025 14:02:50 +0200 Subject: [PATCH 14/15] Revert CUDA upgrade for AWS EFA tests libfabric hangs and tests fail on timeout when CUDA 13 images are used Signed-off-by: Alexey Rivkin --- contrib/aws-efa/README.md | 2 +- contrib/aws-efa/aws_job_def.json | 2 +- contrib/aws-efa/aws_test.sh | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/aws-efa/README.md b/contrib/aws-efa/README.md index 58e171cf5..cd74a7e06 100644 --- a/contrib/aws-efa/README.md +++ b/contrib/aws-efa/README.md @@ -89,7 +89,7 @@ The AWS test script: ## Container Image -The script uses the container image: `nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04` +The script uses the container image: `nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04` You can
override this by setting the `CONTAINER_IMAGE` environment variable: ```bash diff --git a/contrib/aws-efa/aws_job_def.json b/contrib/aws-efa/aws_job_def.json index 5a539fb91..6477a16aa 100644 --- a/contrib/aws-efa/aws_job_def.json +++ b/contrib/aws-efa/aws_job_def.json @@ -15,7 +15,7 @@ "imagePullSecrets": [], "containers": [ { - "image": "nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04", + "image": "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04", "command": [ "/bin/bash", "-c", diff --git a/contrib/aws-efa/aws_test.sh b/contrib/aws-efa/aws_test.sh index 44694e08f..0477757a7 100755 --- a/contrib/aws-efa/aws_test.sh +++ b/contrib/aws-efa/aws_test.sh @@ -30,7 +30,7 @@ usage() { echo " GITHUB_REPOSITORY - GitHub repository (e.g., \"ai-dynamo/nixl\")" echo "" echo "Optional environment variables:" - echo " CONTAINER_IMAGE - Container image to use (default: nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04)" + echo " CONTAINER_IMAGE - Container image to use (default: nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04)" echo " TEST_TIMEOUT - Timeout for test execution in minutes" exit 1 } @@ -47,7 +47,7 @@ if [ -z "$GITHUB_REF" ] || [ -z "$GITHUB_SERVER_URL" ] || [ -z "$GITHUB_REPOSITO fi test_cmd="$1" -export CONTAINER_IMAGE=${CONTAINER_IMAGE:-"nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04"} +export CONTAINER_IMAGE=${CONTAINER_IMAGE:-"nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04"} # Set Git checkout command based on GITHUB_REF case "$GITHUB_REF" in From 45e0116e1c1152520d2a6c6dc3077bcf0d77ec4f Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Thu, 13 Nov 2025 14:50:05 +0200 Subject: [PATCH 15/15] Add CUDA12 image to the build matrix Some tests (e.g. gpunetio) only run on specific CUDA ver. Adding both CUDA 12 and 13 improves coverage. 
Signed-off-by: Alexey Rivkin --- .ci/jenkins/lib/build-matrix.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci/jenkins/lib/build-matrix.yaml b/.ci/jenkins/lib/build-matrix.yaml index 805c45bd3..941f021e6 100644 --- a/.ci/jenkins/lib/build-matrix.yaml +++ b/.ci/jenkins/lib/build-matrix.yaml @@ -34,7 +34,8 @@ kubernetes: requests: "{memory: 8Gi, cpu: 8000m}" runs_on_dockers: - - { name: "ubuntu24.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04" } + - { name: "ubuntu24.04-cuda12-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04" } + - { name: "ubuntu24.04-cuda13-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04" } - { name: "ubuntu22.04-cuda-dl-base", url: "nvidia/cuda:13.0.1-devel-ubuntu22.04" } - { name: "podman-v5.0.2", url: "quay.io/podman/stable:v5.0.2", category: 'tool', privileged: true }