Skip to content

Commit 383466b

Browse files
committed
CI consolidation: Test matrix
Signed-off-by: Alexey Rivkin <[email protected]>
1 parent 3e0a245 commit 383466b

File tree

1 file changed

+45
-71
lines changed

1 file changed

+45
-71
lines changed

.ci/jenkins/lib/test-matrix.yaml

Lines changed: 45 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -17,124 +17,98 @@
1717

1818
job: nixl-ci-test
1919

20-
# Fail job if one of the steps fails or continue
21-
failFast: false
20+
registry_host: harbor.mellanox.com
21+
registry_auth: nixl_harbor_credentials
22+
registry_path: /nixl
2223

24+
failFast: false
2325
timeout_minutes: 240
2426

25-
# The label is defined in the Jenkins slave configuration; we want to run the job on a GPU agent and be able to easily replace it without having to change this file
2627
runs_on_agents:
2728
- {nodeLabel: 'H100'}
2829
# - {nodeLabel: 'DGX'}
2930

31+
runs_on_dockers:
32+
- {
33+
file: "contrib/Dockerfile",
34+
name: "ubuntu24.04-nixl-base",
35+
uri: "$arch/$name",
36+
tag: "20251103",
37+
build_args: "--target nixl-base --build-arg ARCH=$arch",
38+
nodeLabel: "H100"
39+
}
40+
3041
matrix:
3142
axes:
32-
image:
33-
- nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04
3443
arch:
3544
- x86_64
3645
ucx_version:
3746
- master
3847
- v1.19.0
3948

40-
taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}"
41-
4249
env:
43-
CONTAINER_WORKSPACE: /workspace
44-
INSTALL_DIR: ${CONTAINER_WORKSPACE}/nixl_install
45-
# Manual timeout - ci-demo doesn't handle docker exec
50+
NIXL_BASE_IMAGE_ENV: "true"
51+
NIXL_INSTALL_DIR: /opt/nixl
4652
TEST_TIMEOUT: 30
47-
# NPROC for bare-metal: containers see all host CPUs, need to limit parallelism
53+
UCX_TLS: "^shm"
4854
NPROC: 16
4955

5056
steps:
5157
- name: Get Environment Info
5258
parallel: false
5359
run: |
5460
set +ex
55-
# print kernel version
5661
uname -r
57-
# print ofed info
58-
ofed_info -s
59-
# print nvidia drivers info
60-
lsmod | grep nvidia_peermem
61-
lsmod | grep gdrdrv
62-
lsmod | grep nvidia_fs
63-
# print nvidia-smi
64-
nvidia-smi
65-
nvidia-smi topo -m
66-
# print MPS info
67-
pgrep -a mps
68-
# print compute mode
69-
nvidia-smi -q | grep -i "compute mode"
70-
# check rdma status
71-
ibv_devinfo
72-
#ib_write_bw
73-
74-
75-
- name: Build GPU Test Environment
62+
ofed_info -s || true
63+
lsmod | grep nvidia_peermem || true
64+
lsmod | grep gdrdrv || true
65+
lsmod | grep nvidia_fs || true
66+
nvidia-smi || true
67+
nvidia-smi topo -m || true
68+
pgrep -a mps || true
69+
nvidia-smi -q | grep -i "compute mode" || true
70+
ibv_devinfo || true
71+
72+
- name: Setup Build Environment
7673
parallel: false
7774
run: |
78-
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} --build-arg WORKSPACE=${CONTAINER_WORKSPACE} .
79-
onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
80-
81-
- name: Run GPU Test Environment
82-
parallel: false
83-
run: |
84-
docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
85-
--ulimit memlock=-1:-1 \
86-
--network=host \
87-
--ipc=host \
88-
--cap-add=SYS_PTRACE \
89-
--gpus all \
90-
--device=/dev/infiniband \
91-
--device=/dev/gdrdrv \
92-
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
93-
onfail: |
94-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
95-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
75+
# Setup user environment (replaces Dockerfile.gpu_test)
76+
groupadd -g 30 hardware || true
77+
useradd -u 148069 -g 30 -m -s /bin/bash svc-nixl || true
78+
apt-get update -qq && apt-get install -y -qq sudo
79+
echo "svc-nixl ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/svc-nixl
80+
chmod 440 /etc/sudoers.d/svc-nixl
81+
mkdir -p /workspace && chmod 777 /workspace
82+
mkdir -p /opt/nixl
9683
9784
- name: Build
9885
parallel: false
9986
run: |
100-
set -ex
101-
docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c "UCX_VERSION=${ucx_version} .gitlab/build.sh ${INSTALL_DIR}"
102-
103-
onfail: |
104-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
105-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
87+
UCX_VERSION=${ucx_version} .gitlab/build.sh ${NIXL_INSTALL_DIR}
10688
10789
- name: Test CPP
10890
parallel: false
91+
timeout: "${TEST_TIMEOUT}"
10992
run: |
110-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
111-
onfail: |
112-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
113-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
93+
.gitlab/test_cpp.sh ${NIXL_INSTALL_DIR}
11494
11595
- name: Test Python
11696
parallel: false
97+
timeout: "${TEST_TIMEOUT}"
11798
run: |
118-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
119-
onfail: |
120-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
121-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
99+
.gitlab/test_python.sh ${NIXL_INSTALL_DIR}
122100
123101
- name: Test Nixlbench
124102
parallel: false
103+
timeout: "${TEST_TIMEOUT}"
125104
run: |
126-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_nixlbench.sh ${INSTALL_DIR}"
127-
onfail: |
128-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
129-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
105+
.gitlab/test_nixlbench.sh ${NIXL_INSTALL_DIR}
130106
131107
- name: Test Rust
132108
parallel: false
109+
timeout: "${TEST_TIMEOUT}"
133110
run: |
134-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_rust.sh ${INSTALL_DIR}"
135-
always: |
136-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
137-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
111+
.gitlab/test_rust.sh ${NIXL_INSTALL_DIR}
138112
139113
# Once this fix is merged, we can use the following to stop/kill/rm the container instead of repeating the cleanup command in each step
140114
# https://github.com/Mellanox/ci-demo/pull/111

0 commit comments

Comments
 (0)