diff --git a/.ci/dockerfiles/Dockerfile.gpu_test b/.ci/dockerfiles/Dockerfile.gpu_test index c441bea6b..36fa826f6 100644 --- a/.ci/dockerfiles/Dockerfile.gpu_test +++ b/.ci/dockerfiles/Dockerfile.gpu_test @@ -13,7 +13,7 @@ # docker run --gpus all --privileged -it nixl-gpu-test # # Build arguments: -# BASE_IMAGE: Base NVIDIA cuda-dl-base image (default: nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04) +# BASE_IMAGE: Base NVIDIA cuda-dl-base image (default: nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04) # _UID: User ID for the non-root user (default: 148069) # _GID: Group ID for the user (default: 30) # _LOGIN: Username (default: svc-nixl) @@ -22,7 +22,7 @@ # WORKSPACE: Workspace directory path # -ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 +ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 FROM ${BASE_IMAGE} diff --git a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md index c92469584..f3b59dc8a 100644 --- a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md +++ b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md @@ -137,7 +137,7 @@ sudo nvidia-ctk runtime configure --runtime=docker sudo systemctl restart docker ``` -Verify GPU access in containers using `docker run --gpus all nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 nvidia-smi`[^1_3]. +Verify GPU access in containers using `docker run --gpus all nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 nvidia-smi`[^1_3]. ### 9. 
**Validation and Troubleshooting** diff --git a/.ci/jenkins/lib/build-matrix.yaml b/.ci/jenkins/lib/build-matrix.yaml index c476b7d59..941f021e6 100644 --- a/.ci/jenkins/lib/build-matrix.yaml +++ b/.ci/jenkins/lib/build-matrix.yaml @@ -6,7 +6,7 @@ # Key Components: # - Job Configuration: Defines timeout, failure behavior, and Kubernetes resources # - Docker Images: Specifies the container images used for different build stages -# - cuda-dl-base images (25.06 for Ubuntu 24.04, 24.10 for Ubuntu 22.04) for building and testing +# - CUDA base images (cuda-dl-base 25.10/25.06 for Ubuntu 24.04, nvidia/cuda 13.0.1 for Ubuntu 22.04) for building and testing # - Podman image for container builds # - Matrix Axes: Defines build variations (currently x86_64 architecture) # - Build Steps: Sequential steps for building, testing, and container creation @@ -34,8 +34,9 @@ kubernetes: requests: "{memory: 8Gi, cpu: 8000m}" runs_on_dockers: - - { name: "ubuntu24.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04" } - - { name: "ubuntu22.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:24.10-cuda12.6-devel-ubuntu22.04" } + - { name: "ubuntu24.04-cuda12-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04" } + - { name: "ubuntu24.04-cuda13-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04" } + - { name: "ubuntu22.04-cuda-dl-base", url: "nvidia/cuda:13.0.1-devel-ubuntu22.04" } - { name: "podman-v5.0.2", url: "quay.io/podman/stable:v5.0.2", category: 'tool', privileged: true } matrix: diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index d9ac4f129..06c15aee9 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -30,6 +30,7 @@ runs_on_agents: matrix: axes: image: + - nvcr.io/nvidia/cuda-dl-base:25.10-cuda13.0-devel-ubuntu24.04 - nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 arch: - x86_64 diff --git a/.ci/jenkins/pipeline/proj-jjb.yaml
b/.ci/jenkins/pipeline/proj-jjb.yaml index d85c8fc26..740edbb22 100644 --- a/.ci/jenkins/pipeline/proj-jjb.yaml +++ b/.ci/jenkins/pipeline/proj-jjb.yaml @@ -280,7 +280,7 @@ description: "Base Docker image for the container build" - string: name: "BASE_IMAGE_TAG" - default: "25.06-cuda12.9-devel-ubuntu24.04" + default: "25.10-cuda13.0-devel-ubuntu24.04" description: "Tag for the base Docker image" - string: name: "TAG_SUFFIX" @@ -294,7 +294,7 @@ description: > Update the latest tag for this architecture.
When enabled, also creates: <base-image-tag>-<arch>-latest
- Example: 25.06-cuda12.9-devel-ubuntu24.04-aarch64-latest
+ Example: 25.10-cuda13.0-devel-ubuntu24.04-aarch64-latest
- string: name: "MAIL_TO" default: "25f58ae0.NVIDIA.onmicrosoft.com@amer.teams.ms" diff --git a/.gitlab/test_rust.sh b/.gitlab/test_rust.sh index dd8f38682..caafbf999 100755 --- a/.gitlab/test_rust.sh +++ b/.gitlab/test_rust.sh @@ -36,7 +36,7 @@ which cargo cargo --version export LD_LIBRARY_PATH=${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins:/usr/local/lib:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64:/usr/local/cuda-12.8/compat:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64:/usr/local/cuda-13.0/compat:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/compat/lib.real:$LD_LIBRARY_PATH export CPATH=${INSTALL_DIR}/include:$CPATH export PATH=${INSTALL_DIR}/bin:$PATH diff --git a/benchmark/nixlbench/README.md b/benchmark/nixlbench/README.md index fb37927c0..af6657de8 100644 --- a/benchmark/nixlbench/README.md +++ b/benchmark/nixlbench/README.md @@ -65,7 +65,7 @@ A comprehensive benchmarking tool for the NVIDIA Inference Xfer Library (NIXL) t - **Operating System**: Ubuntu 22.04/24.04 LTS (recommended) or RHEL-based - **Docker**: Version 20.10+ (for container builds) - **Git**: For source code management -- **CUDA Toolkit**: 12.8+ (for GPU features) +- **CUDA Toolkit**: 13.0+ (for GPU features) - **Python**: 3.12+ (for benchmark utilities) ## Quick Start @@ -172,7 +172,7 @@ cd nixl/benchmark/nixlbench/contrib | `--ucx ` | Path to custom UCX source (optional) | Uses base image UCX | | `--build-type ` | Build type: `debug` or `release` | `release` | | `--base-image ` | Base Docker image | `nvcr.io/nvidia/cuda-dl-base` | -| `--base-image-tag ` | Base image tag | `25.06-cuda12.9-devel-ubuntu24.04` | +| `--base-image-tag ` | Base image tag | `25.10-cuda13.0-devel-ubuntu24.04` | | `--arch ` | Target architecture: `x86_64` or `aarch64` | Auto-detected | | 
`--python-versions ` | Python versions (comma-separated) | `3.12` | | `--tag ` | Custom Docker image tag | Auto-generated | @@ -187,7 +187,7 @@ For development environments or when Docker is not available. **Required:** - **NIXL**: Core communication library - **UCX**: Unified Communication X library -- **CUDA**: NVIDIA CUDA Toolkit (≥12.8) +- **CUDA**: NVIDIA CUDA Toolkit (≥13.0) - **CMake**: Build system (≥3.20) - **Meson**: Build system for NIXL/NIXLBench - **Ninja**: Build backend @@ -234,9 +234,9 @@ sudo apt-get reinstall -y --no-install-recommends \ #### CUDA Toolkit Installation ```bash -# Download and install CUDA 12.8 -wget https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_550.54.15_linux.run -sudo sh cuda_12.8.0_550.54.15_linux.run +# Download and install CUDA 13.0 +wget https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run +sudo sh cuda_13.0.2_580.95.05_linux.run # Set environment variables export PATH=/usr/local/cuda/bin:$PATH diff --git a/benchmark/nixlbench/contrib/Dockerfile b/benchmark/nixlbench/contrib/Dockerfile index 2c6ee0e01..fb50f6873 100644 --- a/benchmark/nixlbench/contrib/Dockerfile +++ b/benchmark/nixlbench/contrib/Dockerfile @@ -14,7 +14,7 @@ # limitations under the License. 
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04" # UCX argument is either "upstream" (default installed in base image) or "custom" (build from source) ARG UCX="upstream" diff --git a/benchmark/nixlbench/contrib/build.sh b/benchmark/nixlbench/contrib/build.sh index 2571d366f..dbd2eea33 100755 --- a/benchmark/nixlbench/contrib/build.sh +++ b/benchmark/nixlbench/contrib/build.sh @@ -35,7 +35,7 @@ if [ -z ${latest_tag} ]; then fi BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base -BASE_IMAGE_TAG=25.06-cuda12.9-devel-ubuntu24.04 +BASE_IMAGE_TAG=25.10-cuda13.0-devel-ubuntu24.04 ARCH=$(uname -m) [ "$ARCH" = "arm64" ] && ARCH="aarch64" WHL_BASE=manylinux_2_39 diff --git a/contrib/Dockerfile b/contrib/Dockerfile index aba12ef64..ee34975c9 100644 --- a/contrib/Dockerfile +++ b/contrib/Dockerfile @@ -14,7 +14,7 @@ # limitations under the License. ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04" ARG OS FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} diff --git a/contrib/build-container.sh b/contrib/build-container.sh index 05966595c..9e2c79dec 100755 --- a/contrib/build-container.sh +++ b/contrib/build-container.sh @@ -29,7 +29,7 @@ fi VERSION=v$latest_tag.dev.$commit_id BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base -BASE_IMAGE_TAG=25.06-cuda12.9-devel-ubuntu24.04 +BASE_IMAGE_TAG=25.10-cuda13.0-devel-ubuntu24.04 ARCH=$(uname -m) [ "$ARCH" = "arm64" ] && ARCH="aarch64" WHL_BASE=manylinux_2_39 @@ -148,7 +148,8 @@ get_options() { done if [[ $OS == "ubuntu22" ]]; then - BASE_IMAGE_TAG=24.10-cuda12.6-devel-ubuntu22.04 + BASE_IMAGE=nvidia/cuda + BASE_IMAGE_TAG=13.0.1-devel-ubuntu22.04 WHL_BASE=${WHL_BASE:-manylinux_2_34} fi diff --git a/src/utils/libfabric/libfabric_common.cpp b/src/utils/libfabric/libfabric_common.cpp index 140694e11..ec945cedb 100644 --- a/src/utils/libfabric/libfabric_common.cpp +++ 
b/src/utils/libfabric/libfabric_common.cpp @@ -50,7 +50,7 @@ getAvailableNetworkDevices() { hints->mode = FI_CONTEXT; hints->ep_attr->type = FI_EP_RDM; - int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); + int ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed " << fi_strerror(-ret); fi_freeinfo(hints); diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index f5b155c7b..6f0fa6fd4 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -431,7 +431,7 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device, hints->domain_attr->threading = FI_THREAD_SAFE; try { // Get fabric info for this specific device - first try with FI_HMEM - int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); + int ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); // If no provider found with FI_HMEM, retry without it if (ret || !info) { @@ -442,7 +442,7 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device, hints->caps = FI_MSG | FI_RMA; hints->caps |= FI_LOCAL_COMM | FI_REMOTE_COMM; - ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); + ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for rail " << rail_id << ": " << fi_strerror(-ret); throw std::runtime_error("fi_getinfo failed for rail " + std::to_string(rail_id)); diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index ff428bc96..bac8c25bb 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -381,7 +381,7 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { // This ensures consistency between device discovery and PCIe mapping hints->fabric_attr->prov_name = strdup(provider_name.c_str()); - int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 
0, hints, &info); + int ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for PCIe mapping with provider " << provider_name << ": " << fi_strerror(-ret);