Skip to content

Commit f6e2edd

Browse files
committed
CI consolidation: Test matrix
Signed-off-by: Alexey Rivkin <[email protected]>
1 parent 366d987 commit f6e2edd

File tree

1 file changed

+46
-73
lines changed

1 file changed

+46
-73
lines changed

.ci/jenkins/lib/test-matrix.yaml

Lines changed: 46 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -17,124 +17,97 @@
1717

1818
job: nixl-ci-test
1919

20-
# Fail job if one of the steps fails or continue
21-
failFast: false
20+
registry_host: harbor.mellanox.com
21+
registry_auth: nixl_harbor_credentials
22+
registry_path: /nixl
2223

24+
failFast: false
2325
timeout_minutes: 240
2426

25-
# The label is defined in the Jenkins slave configuration; we want to run the job on a GPU agent and be able to easily replace it without having to change this file
2627
runs_on_agents:
2728
- {nodeLabel: 'H100'}
2829
# - {nodeLabel: 'DGX'}
2930

31+
runs_on_dockers:
32+
- {
33+
file: "contrib/Dockerfile",
34+
name: "ubuntu24.04-nixl-base",
35+
uri: "$arch/$name",
36+
tag: "20251103",
37+
build_args: "--target nixl-base --build-arg ARCH=$arch",
38+
nodeLabel: "H100"
39+
}
40+
3041
matrix:
3142
axes:
32-
image:
33-
- nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04
3443
arch:
3544
- x86_64
3645
ucx_version:
3746
- master
3847
- v1.19.0
3948

40-
taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}"
41-
4249
env:
43-
CONTAINER_WORKSPACE: /workspace
44-
INSTALL_DIR: ${CONTAINER_WORKSPACE}/nixl_install
45-
# Manual timeout - ci-demo doesn't handle docker exec
50+
NIXL_BASE_IMAGE_ENV: "true"
51+
NIXL_INSTALL_DIR: /opt/nixl
4652
TEST_TIMEOUT: 30
47-
# NPROC for bare-metal: containers see all host CPUs, need to limit parallelism
53+
UCX_TLS: "^shm"
4854
NPROC: 16
4955

56+
pipeline_start:
57+
run: |
58+
# Setup user environment (replaces Dockerfile.gpu_test)
59+
if ! getent group "30" > /dev/null 2>&1; then
60+
sudo groupadd -g 30 hardware
61+
fi
62+
sudo useradd -u 148069 -g 30 -m -s /bin/bash svc-nixl || true
63+
echo "svc-nixl ALL=(ALL) NOPASSWD: ALL" | sudo tee /etc/sudoers.d/svc-nixl
64+
sudo chmod 440 /etc/sudoers.d/svc-nixl
65+
sudo mkdir -p /workspace && sudo chmod 777 /workspace
66+
5067
steps:
5168
- name: Get Environment Info
5269
parallel: false
5370
run: |
5471
set +ex
55-
# print kernel version
5672
uname -r
57-
# print ofed info
58-
ofed_info -s
59-
# print nvidia drivers info
60-
lsmod | grep nvidia_peermem
61-
lsmod | grep gdrdrv
62-
lsmod | grep nvidia_fs
63-
# print nvidia-smi
64-
nvidia-smi
65-
nvidia-smi topo -m
66-
# print MPS info
67-
pgrep -a mps
68-
# print compute mode
69-
nvidia-smi -q | grep -i "compute mode"
70-
# check rdma status
71-
ibv_devinfo
72-
#ib_write_bw
73-
74-
75-
- name: Build GPU Test Environment
76-
parallel: false
77-
run: |
78-
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} --build-arg WORKSPACE=${CONTAINER_WORKSPACE} .
79-
onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
80-
81-
- name: Run GPU Test Environment
82-
parallel: false
83-
run: |
84-
docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
85-
--ulimit memlock=-1:-1 \
86-
--network=host \
87-
--ipc=host \
88-
--cap-add=SYS_PTRACE \
89-
--gpus all \
90-
--device=/dev/infiniband \
91-
--device=/dev/gdrdrv \
92-
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
93-
onfail: |
94-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
95-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
73+
ofed_info -s || true
74+
lsmod | grep nvidia_peermem || true
75+
lsmod | grep gdrdrv || true
76+
lsmod | grep nvidia_fs || true
77+
nvidia-smi || true
78+
nvidia-smi topo -m || true
79+
pgrep -a mps || true
80+
nvidia-smi -q | grep -i "compute mode" || true
81+
ibv_devinfo || true
9682
9783
- name: Build
9884
parallel: false
9985
run: |
100-
set -ex
101-
docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c "UCX_VERSION=${ucx_version} .gitlab/build.sh ${INSTALL_DIR}"
102-
103-
onfail: |
104-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
105-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
86+
UCX_VERSION=${ucx_version} .gitlab/build.sh ${NIXL_INSTALL_DIR}
10687
10788
- name: Test CPP
10889
parallel: false
90+
timeout: "${TEST_TIMEOUT}"
10991
run: |
110-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
111-
onfail: |
112-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
113-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
92+
.gitlab/test_cpp.sh ${NIXL_INSTALL_DIR}
11493
11594
- name: Test Python
11695
parallel: false
96+
timeout: "${TEST_TIMEOUT}"
11797
run: |
118-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
119-
onfail: |
120-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
121-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
98+
.gitlab/test_python.sh ${NIXL_INSTALL_DIR}
12299
123100
- name: Test Nixlbench
124101
parallel: false
102+
timeout: "${TEST_TIMEOUT}"
125103
run: |
126-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_nixlbench.sh ${INSTALL_DIR}"
127-
onfail: |
128-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
129-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
104+
.gitlab/test_nixlbench.sh ${NIXL_INSTALL_DIR}
130105
131106
- name: Test Rust
132107
parallel: false
108+
timeout: "${TEST_TIMEOUT}"
133109
run: |
134-
timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_rust.sh ${INSTALL_DIR}"
135-
always: |
136-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
137-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
110+
.gitlab/test_rust.sh ${NIXL_INSTALL_DIR}
138111
139112
# once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step
140113
# https://github.com/Mellanox/ci-demo/pull/111

0 commit comments

Comments
 (0)