Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions .ci/jenkins/lib/build-matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
---
job: nixl-ci-build

registry_host: harbor.mellanox.com
registry_auth: nixl_harbor_credentials
registry_path: /nixl

# Fail the job if one of the steps fails, or continue with the remaining steps
failFast: false

Expand All @@ -34,8 +38,20 @@ kubernetes:
requests: "{memory: 8Gi, cpu: 8000m}"

runs_on_dockers:
- { name: "ubuntu24.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04" }
- { name: "ubuntu22.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:24.10-cuda12.6-devel-ubuntu22.04" }
- {
file: "contrib/Dockerfile",
name: "ubuntu24.04-nixl-base",
uri: "$arch/$name",
tag: "20251103",
build_args: "--target nixl-base --build-arg ARCH=$arch"
}
- {
file: "contrib/Dockerfile",
name: "ubuntu22.04-nixl-base",
uri: "$arch/$name",
tag: "20251103",
build_args: "--target nixl-base --build-arg ARCH=$arch --build-arg BASE_IMAGE_TAG=24.10-cuda12.6-devel-ubuntu22.04"
}
- { name: "podman-v5.0.2", url: "quay.io/podman/stable:v5.0.2", category: 'tool', privileged: true }

matrix:
Expand All @@ -45,6 +61,7 @@ matrix:
- aarch64

env:
NIXL_BASE_IMAGE_ENV: "true"
NIXL_INSTALL_DIR: /opt/nixl
TEST_TIMEOUT: 30
UCX_TLS: "^shm"
Expand Down
108 changes: 35 additions & 73 deletions .ci/jenkins/lib/test-matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,124 +17,86 @@

job: nixl-ci-test

# Fail job if one of the steps fails or continue
failFast: false
registry_host: harbor.mellanox.com
registry_auth: nixl_harbor_credentials
registry_path: /nixl

failFast: false
timeout_minutes: 240

# label is defined in the Jenkins agent configuration; we want to run the job on a GPU agent and be able to easily replace it without having to change this file
runs_on_agents:
- {nodeLabel: 'H100'}
# - {nodeLabel: 'DGX'}

runs_on_dockers:
- {
file: "contrib/Dockerfile",
name: "ubuntu24.04-nixl-base",
uri: "$arch/$name",
tag: "20251103",
build_args: "--target nixl-base --build-arg ARCH=$arch",
nodeLabel: "H100"
}

matrix:
axes:
image:
- nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04
arch:
- x86_64
ucx_version:
- master
- v1.19.0

taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}"

env:
CONTAINER_WORKSPACE: /workspace
INSTALL_DIR: ${CONTAINER_WORKSPACE}/nixl_install
# Manual timeout - ci-demo doesn't handle docker exec
NIXL_BASE_IMAGE_ENV: "true"
NIXL_INSTALL_DIR: /opt/nixl
TEST_TIMEOUT: 30
# NPROC for bare-metal: containers see all host CPUs, need to limit parallelism
UCX_TLS: "^shm"
NPROC: 16

steps:
- name: Get Environment Info
parallel: false
run: |
set +ex
# print kernel version
uname -r
# print ofed info
ofed_info -s
# print nvidia drivers info
lsmod | grep nvidia_peermem
lsmod | grep gdrdrv
lsmod | grep nvidia_fs
# print nvidia-smi
nvidia-smi
nvidia-smi topo -m
# print MPS info
pgrep -a mps
# print compute mode
nvidia-smi -q | grep -i "compute mode"
# check rdma status
ibv_devinfo
#ib_write_bw


- name: Build GPU Test Environment
parallel: false
run: |
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} --build-arg WORKSPACE=${CONTAINER_WORKSPACE} .
onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"

- name: Run GPU Test Environment
parallel: false
run: |
docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
--ulimit memlock=-1:-1 \
--network=host \
--ipc=host \
--cap-add=SYS_PTRACE \
--gpus all \
--device=/dev/infiniband \
--device=/dev/gdrdrv \
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
onfail: |
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
ofed_info -s || true
lsmod | grep nvidia_peermem || true
lsmod | grep gdrdrv || true
lsmod | grep nvidia_fs || true
nvidia-smi || true
nvidia-smi topo -m || true
pgrep -a mps || true
nvidia-smi -q | grep -i "compute mode" || true
ibv_devinfo || true

- name: Build
parallel: false
run: |
set -ex
docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c "UCX_VERSION=${ucx_version} .gitlab/build.sh ${INSTALL_DIR}"

onfail: |
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
UCX_VERSION=${ucx_version} .gitlab/build.sh ${NIXL_INSTALL_DIR}

- name: Test CPP
parallel: false
timeout: "${TEST_TIMEOUT}"
run: |
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
onfail: |
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
.gitlab/test_cpp.sh ${NIXL_INSTALL_DIR}

- name: Test Python
parallel: false
timeout: "${TEST_TIMEOUT}"
run: |
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
onfail: |
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
.gitlab/test_python.sh ${NIXL_INSTALL_DIR}

- name: Test Nixlbench
parallel: false
timeout: "${TEST_TIMEOUT}"
run: |
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_nixlbench.sh ${INSTALL_DIR}"
onfail: |
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
.gitlab/test_nixlbench.sh ${NIXL_INSTALL_DIR}

- name: Test Rust
parallel: false
timeout: "${TEST_TIMEOUT}"
run: |
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_rust.sh ${INSTALL_DIR}"
always: |
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
.gitlab/test_rust.sh ${NIXL_INSTALL_DIR}

# once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step
# https://github.com/Mellanox/ci-demo/pull/111
Expand Down
72 changes: 40 additions & 32 deletions .gitlab/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ fi
ARCH=$(uname -m)
[ "$ARCH" = "arm64" ] && ARCH="aarch64"

# Skip dependency installation if running in pre-built nixl-base image
if [ -n "${NIXL_BASE_IMAGE_ENV}" ]; then
# Use pre-installed libfabric from base image
LIBFABRIC_INSTALL_DIR=/usr/local
else

# Some docker images are with broken installations:
$SUDO rm -rf /usr/lib/cmake/grpc /usr/lib/cmake/protobuf

Expand Down Expand Up @@ -103,17 +109,6 @@ $SUDO apt-get -qq install -y python3-dev \
libhwloc-dev \
libcurl4-openssl-dev zlib1g-dev # aws-sdk-cpp dependencies

# Ubuntu 22.04 specific setup
if grep -q "Ubuntu 22.04" /etc/os-release 2>/dev/null; then
# Upgrade pip for '--break-system-packages' support
$SUDO pip3 install --upgrade pip

# Upgrade meson (distro version 0.61.2 is too old, project requires >= 0.64.0)
$SUDO pip3 install --upgrade meson
# Ensure pip3's meson takes precedence over apt's version
export PATH="$HOME/.local/bin:/usr/local/bin:$PATH"
fi

# Add DOCA repository and install packages
ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi)
MELLANOX_OS="$(. /etc/lsb-release; echo ${DISTRIB_ID}${DISTRIB_RELEASE} | tr A-Z a-z | tr -d .)"
Expand All @@ -140,27 +135,6 @@ chmod +x install_uv.sh
./install_uv.sh
export PATH="$HOME/.local/bin:$PATH"

curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz
( \
cd openucx-ucx* && \
./autogen.sh && \
./configure \
--prefix="${UCX_INSTALL_DIR}" \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-verbs \
--with-dm \
${UCX_CUDA_BUILD_ARGS} \
--enable-mt && \
make -j && \
make -j install-strip && \
$SUDO ldconfig \
)

wget --tries=3 --waitretry=5 -O "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
tar xjf "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
Expand Down Expand Up @@ -215,6 +189,40 @@ rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
cp gtest-parallel/* ${INSTALL_DIR}/bin/
)

fi # end NIXL_BASE_IMAGE_ENV check

# Build UCX
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz
( \
cd openucx-ucx* && \
./autogen.sh && \
./configure \
--prefix="${UCX_INSTALL_DIR}" \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-verbs \
--with-dm \
${UCX_CUDA_BUILD_ARGS} \
--enable-mt && \
make -j && \
make -j install-strip && \
$SUDO ldconfig \
)

# Ubuntu 22.04 specific setup
if grep -q "Ubuntu 22.04" /etc/os-release 2>/dev/null; then
# Upgrade pip for '--break-system-packages' support
$SUDO pip3 install --upgrade pip
# Upgrade meson (distro version 0.61.2 is too old, project requires >= 0.64.0)
$SUDO pip3 install --upgrade meson
# Ensure pip3's meson takes precedence over apt's version
export PATH="$HOME/.local/bin:/usr/local/bin:$PATH"
fi

export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:${LIBFABRIC_INSTALL_DIR}/lib"
export CPATH="${INSTALL_DIR}/include:${LIBFABRIC_INSTALL_DIR}/include:$CPATH"
export PATH="${INSTALL_DIR}/bin:$PATH"
Expand Down
Loading