diff --git a/.ci/jenkins/lib/build-matrix.yaml b/.ci/jenkins/lib/build-matrix.yaml index 1ea9b3637..898ad006a 100644 --- a/.ci/jenkins/lib/build-matrix.yaml +++ b/.ci/jenkins/lib/build-matrix.yaml @@ -22,6 +22,10 @@ --- job: nixl-ci-build +registry_host: harbor.mellanox.com +registry_auth: nixl_harbor_credentials +registry_path: /nixl + # Fail job if one of the steps fails or continue failFast: false @@ -34,8 +38,20 @@ kubernetes: requests: "{memory: 8Gi, cpu: 8000m}" runs_on_dockers: - - { name: "ubuntu24.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04" } - - { name: "ubuntu22.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:24.10-cuda12.6-devel-ubuntu22.04" } + - { + file: "contrib/Dockerfile", + name: "ubuntu24.04-nixl-base", + uri: "$arch/$name", + tag: "20251103", + build_args: "--target nixl-base --build-arg ARCH=$arch" + } + - { + file: "contrib/Dockerfile", + name: "ubuntu22.04-nixl-base", + uri: "$arch/$name", + tag: "20251103", + build_args: "--target nixl-base --build-arg ARCH=$arch --build-arg BASE_IMAGE_TAG=24.10-cuda12.6-devel-ubuntu22.04" + } - { name: "podman-v5.0.2", url: "quay.io/podman/stable:v5.0.2", category: 'tool', privileged: true } matrix: @@ -45,6 +61,7 @@ matrix: - aarch64 env: + NIXL_BASE_IMAGE_ENV: "true" NIXL_INSTALL_DIR: /opt/nixl TEST_TIMEOUT: 30 UCX_TLS: "^shm" diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 34836cb2f..5816f3d4e 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -17,34 +17,40 @@ job: nixl-ci-test -# Fail job if one of the steps fails or continue -failFast: false +registry_host: harbor.mellanox.com +registry_auth: nixl_harbor_credentials +registry_path: /nixl +failFast: false timeout_minutes: 240 -# label is defined at jenkins slave configuration, we want to run the job on a gpu agent and be able to esaly replace it without having to change this file runs_on_agents: - {nodeLabel: 'H100'} # - {nodeLabel: 'DGX'} +runs_on_dockers: + - { + file: "contrib/Dockerfile", + name: "ubuntu24.04-nixl-base", + uri: "$arch/$name", + tag: "20251103", + build_args: "--target nixl-base --build-arg ARCH=$arch", + nodeLabel: "H100" + } + matrix: axes: - image: - - nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 arch: - x86_64 ucx_version: - master - v1.19.0 -taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}" - env: - CONTAINER_WORKSPACE: /workspace - INSTALL_DIR: ${CONTAINER_WORKSPACE}/nixl_install - # Manual timeout - ci-demo doesn't handle docker exec + NIXL_BASE_IMAGE_ENV: "true" + NIXL_INSTALL_DIR: /opt/nixl TEST_TIMEOUT: 30 - # NPROC for bare-metal: containers see all host CPUs, need to limit parallelism + UCX_TLS: "^shm" NPROC: 16 steps: @@ -52,89 +58,45 @@ steps: parallel: false run: | set +ex - # print kernel version uname -r - # print ofed info - ofed_info -s - # print nvidia drivers info - lsmod | grep nvidia_peermem - lsmod | grep gdrdrv - lsmod | grep nvidia_fs - # print nvidia-smi - nvidia-smi - nvidia-smi topo -m - # print MPS info - pgrep -a mps - # print compute mode - nvidia-smi -q | grep -i "compute mode" - # check rdma status - ibv_devinfo - #ib_write_bw - - - - name: Build GPU Test Environment - parallel: false - run: | - docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} --build-arg WORKSPACE=${CONTAINER_WORKSPACE} . - onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - - - name: Run GPU Test Environment - parallel: false - run: | - docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \ - --ulimit memlock=-1:-1 \ - --network=host \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --gpus all \ - --device=/dev/infiniband \ - --device=/dev/gdrdrv \ - "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - onfail: | - docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" + ofed_info -s || true + lsmod | grep nvidia_peermem || true + lsmod | grep gdrdrv || true + lsmod | grep nvidia_fs || true + nvidia-smi || true + nvidia-smi topo -m || true + pgrep -a mps || true + nvidia-smi -q | grep -i "compute mode" || true + ibv_devinfo || true - name: Build parallel: false run: | - set -ex - docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c "UCX_VERSION=${ucx_version} .gitlab/build.sh ${INSTALL_DIR}" - - onfail: | - docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" + UCX_VERSION=${ucx_version} .gitlab/build.sh ${NIXL_INSTALL_DIR} - name: Test CPP parallel: false + timeout: "${TEST_TIMEOUT}" run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}" - onfail: | - docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" + .gitlab/test_cpp.sh ${NIXL_INSTALL_DIR} - name: Test Python parallel: false + timeout: "${TEST_TIMEOUT}" run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}" - onfail: | - docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" + .gitlab/test_python.sh ${NIXL_INSTALL_DIR} - name: Test Nixlbench parallel: false + timeout: "${TEST_TIMEOUT}" run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_nixlbench.sh ${INSTALL_DIR}" - onfail: | - docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" + .gitlab/test_nixlbench.sh ${NIXL_INSTALL_DIR} - name: Test Rust parallel: false + timeout: "${TEST_TIMEOUT}" run: | - timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_rust.sh ${INSTALL_DIR}" - always: | - docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" - docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" + .gitlab/test_rust.sh ${NIXL_INSTALL_DIR} # once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step # https://github.com/Mellanox/ci-demo/pull/111 diff --git a/.gitlab/build.sh b/.gitlab/build.sh index 60012c7fe..391e2d93e 100755 --- a/.gitlab/build.sh +++ b/.gitlab/build.sh @@ -53,6 +53,12 @@ fi ARCH=$(uname -m) [ "$ARCH" = "arm64" ] && ARCH="aarch64" +# Skip dependency installation if running in pre-built nixl-base image +if [ -n "${NIXL_BASE_IMAGE_ENV}" ]; then + # Use pre-installed libfabric from base image + LIBFABRIC_INSTALL_DIR=/usr/local +else + # Some docker images are with broken installations: $SUDO rm -rf /usr/lib/cmake/grpc /usr/lib/cmake/protobuf @@ -103,17 +109,6 @@ $SUDO apt-get -qq install -y python3-dev \ libhwloc-dev \ libcurl4-openssl-dev zlib1g-dev # aws-sdk-cpp dependencies -# Ubuntu 22.04 specific setup -if grep -q "Ubuntu 22.04" /etc/os-release 2>/dev/null; then - # Upgrade pip for '--break-system-packages' support - $SUDO pip3 install --upgrade pip - - # Upgrade meson (distro version 0.61.2 is too old, project requires >= 0.64.0) - $SUDO pip3 install --upgrade meson - # Ensure pip3's meson takes precedence over apt's version - export PATH="$HOME/.local/bin:/usr/local/bin:$PATH" -fi - # Add DOCA repository and install packages ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) MELLANOX_OS="$(. /etc/lsb-release; echo ${DISTRIB_ID}${DISTRIB_RELEASE} | tr A-Z a-z | tr -d .)" @@ -140,27 +135,6 @@ chmod +x install_uv.sh ./install_uv.sh export PATH="$HOME/.local/bin:$PATH" -curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz -( \ - cd openucx-ucx* && \ - ./autogen.sh && \ - ./configure \ - --prefix="${UCX_INSTALL_DIR}" \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-verbs \ - --with-dm \ - ${UCX_CUDA_BUILD_ARGS} \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - $SUDO ldconfig \ -) - wget --tries=3 --waitretry=5 -O "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" tar xjf "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" @@ -215,6 +189,40 @@ rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" cp gtest-parallel/* ${INSTALL_DIR}/bin/ ) +fi # end NIXL_BASE_IMAGE_ENV check + +# Build UCX +curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz +( \ + cd openucx-ucx* && \ + ./autogen.sh && \ + ./configure \ + --prefix="${UCX_INSTALL_DIR}" \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-verbs \ + --with-dm \ + ${UCX_CUDA_BUILD_ARGS} \ + --enable-mt && \ + make -j && \ + make -j install-strip && \ + $SUDO ldconfig \ +) + +# Ubuntu 22.04 specific setup +if grep -q "Ubuntu 22.04" /etc/os-release 2>/dev/null; then + # Upgrade pip for '--break-system-packages' support + $SUDO pip3 install --upgrade pip + # Upgrade meson (distro version 0.61.2 is too old, project requires >= 0.64.0) + $SUDO pip3 install --upgrade meson + # Ensure pip3's meson takes precedence over apt's version + export PATH="$HOME/.local/bin:/usr/local/bin:$PATH" +fi + export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:${LIBFABRIC_INSTALL_DIR}/lib" export CPATH="${INSTALL_DIR}/include:${LIBFABRIC_INSTALL_DIR}/include:$CPATH" export PATH="${INSTALL_DIR}/bin:$PATH" diff --git a/contrib/Dockerfile b/contrib/Dockerfile index 07619cce9..5383b8e5b 100644 --- a/contrib/Dockerfile +++ b/contrib/Dockerfile @@ -17,7 +17,7 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" ARG OS -FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} +FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS nixl-base # Set default OS if not provided ARG OS=${OS:-ubuntu24} @@ -58,44 +58,73 @@ RUN apt-get update -y && \ flex \ libgtest-dev \ build-essential \ - python3.12-dev \ clang \ hwloc \ libhwloc-dev \ - libcurl4-openssl-dev libssl-dev uuid-dev zlib1g-dev # aws-sdk-cpp dependencies + libcurl4-openssl-dev libssl-dev uuid-dev zlib1g-dev \ + python3-dev \ + python3-pip \ + curl \ + wget \ + numactl \ + git \ + libiberty-dev \ + libgoogle-glog-dev \ + libgmock-dev \ + libjsoncpp-dev \ + libpython3-dev \ + libboost-all-dev \ + meson \ + pkg-config \ + pybind11-dev \ + net-tools \ + iproute2 \ + pciutils \ + libpci-dev \ + libibmad-dev \ + doxygen && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* # Add DOCA repository and install packages RUN ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) && \ MELLANOX_OS="$(. /etc/lsb-release; echo ${DISTRIB_ID}${DISTRIB_RELEASE} | tr A-Z a-z | tr -d .)" && \ wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.1.0/host/doca-host_3.1.0-091000-25.07-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb && \ dpkg -i doca-host.deb && \ + rm -f doca-host.deb && \ apt-get update && \ apt-get upgrade -y && \ apt-get install -y --no-install-recommends doca-sdk-gpunetio libdoca-sdk-gpunetio-dev libdoca-sdk-verbs-dev + # no cleanup, next step needs the repo # Force reinstall of RDMA packages from DOCA repository # Reinstall needed to fix broken libibverbs-dev, which may lead to lack of Infiniband support. # Upgrade is not sufficient if the version is the same since apt skips the installation. RUN DEBIAN_FRONTEND=noninteractive apt-get -y install \ --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \ - libnuma-dev librdmacm-dev ibverbs-providers + libnuma-dev librdmacm-dev ibverbs-providers && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* WORKDIR /workspace RUN git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && \ cd etcd-cpp-apiv3 && \ sed -i '/^find_dependency(cpprestsdk)$/d' etcd-cpp-api-config.in.cmake && \ mkdir build && cd build && \ - cmake .. -DBUILD_ETCD_CORE_ONLY=ON -DCMAKE_BUILD_TYPE=Release && make -j${NPROC:-$(nproc)} && make install + cmake .. -DBUILD_ETCD_CORE_ONLY=ON -DCMAKE_BUILD_TYPE=Release && make -j${NPROC:-$(nproc)} && make install && \ + cd /workspace && rm -rf etcd-cpp-apiv3 RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \ mkdir aws_sdk_build && cd aws_sdk_build && \ cmake ../aws-sdk-cpp/ -DCMAKE_BUILD_TYPE=Release -DBUILD_ONLY="s3" -DENABLE_TESTING=OFF -DCMAKE_INSTALL_PREFIX=/usr/local && \ - make -j${NPROC:-$(nproc)} && make install + make -j${NPROC:-$(nproc)} && make install && \ + cd /workspace && rm -rf aws-sdk-cpp aws_sdk_build -RUN git clone https://github.com/nvidia/gusli.git && \ +RUN git clone --depth 1 https://github.com/nvidia/gusli.git && \ cd gusli && \ make all BUILD_RELEASE=1 BUILD_FOR_UNITEST=0 VERBOSE=1 ALLOW_USE_URING=0 && \ - cd .. + cd .. && \ + rm -rf gusli COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ @@ -115,35 +144,17 @@ RUN wget --tries=3 --waitretry=5 \ rm rustup-init* && \ chmod -R a+w $RUSTUP_HOME $CARGO_HOME -RUN rm -rf /usr/lib/ucx -RUN rm -rf /opt/hpcx/ucx - -RUN cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout $UCX_REF && \ - ./autogen.sh && ./configure \ - --prefix=$UCX_PREFIX \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --with-efa \ - --enable-mt && \ - make -j${NPROC:-$(nproc)} && \ - make -j${NPROC:-$(nproc)} install-strip && \ - ldconfig +# Clean up old UCX installations to avoid conflicts +RUN rm -rf /usr/lib/ucx /opt/hpcx/ucx /usr/local/ucx && \ + ARCH=$(uname -m) && \ + rm -f /usr/lib/${ARCH}-linux-gnu/libucs* /usr/lib/${ARCH}-linux-gnu/libucp* \ + /usr/lib/${ARCH}-linux-gnu/libuct* /usr/lib/${ARCH}-linux-gnu/libucm* 2>/dev/null || true RUN cd /tmp && \ git clone --depth 1 https://github.com/google/gtest-parallel.git && \ mkdir -p /usr/local/bin && \ - cp gtest-parallel/gtest-parallel gtest-parallel/gtest_parallel.py /usr/local/bin/ + cp gtest-parallel/gtest-parallel gtest-parallel/gtest_parallel.py /usr/local/bin/ && \ + rm -rf gtest-parallel ENV PATH=/usr/local/bin:$PATH # Build libfabric from source @@ -165,7 +176,22 @@ RUN wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 \ --enable-gdrcopy-dlopen && \ make -j${NPROC:-$(nproc)} && \ make install && \ - ldconfig + ldconfig && \ + cd /tmp && rm -rf libfabric-* + +# Stage 2: Build NIXL (default stage) +FROM nixl-base + +# Re-declare ARGs needed in this stage +ARG ARCH="x86_64" +ARG NIXL_PREFIX="/usr/local/nixl" +ARG NIXL_PLUGIN_DIR="$NIXL_PREFIX/lib/$ARCH-linux-gnu/plugins" +ARG DEFAULT_PYTHON_VERSION="3.12" +ARG UCX_REF="v1.19.0" +ARG UCX_PREFIX="/usr" +ARG UCX_PLUGIN_DIR="$UCX_PREFIX/lib/ucx" +ARG LIBFABRIC_INSTALL_PATH="/usr/local" +ARG NPROC # By default, uv downloads python packages to $HOME/.cache/uv and hard links them # from the virtual environment. This means that the files reside in /root/.cache/uv, @@ -190,14 +216,35 @@ RUN uv pip install --upgrade meson meson-python pybind11 patchelf pyYAML click t RUN export UV_INDEX="https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .)" && \ uv pip install torch torchvision torchaudio +# Build UCX in Stage 2 for build-container.sh (CI uses build.sh to build UCX) +RUN cd /usr/local/src && \ + git clone https://github.com/openucx/ucx.git && \ + cd ucx && \ + git checkout $UCX_REF && \ + ./autogen.sh && ./configure \ + --prefix=$UCX_PREFIX \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=/usr/local/cuda \ + --with-verbs \ + --with-dm \ + --with-gdrcopy=/usr/local \ + --with-efa \ + --enable-mt && \ + make -j${NPROC:-$(nproc)} && \ + make -j${NPROC:-$(nproc)} install-strip && \ + ldconfig && \ + cd /usr/local/src && rm -rf ucx + WORKDIR /workspace/nixl COPY . /workspace/nixl ENV LD_LIBRARY_PATH=/usr/local/lib:$LIBFABRIC_INSTALL_PATH/lib:$LD_LIBRARY_PATH -# Install pybind11 via apt -RUN apt-get update && apt-get install -y --no-install-recommends pybind11-dev - ENV NIXL_PREFIX=$NIXL_PREFIX RUN rm -rf build && \ mkdir build && \