 
 job: nixl-ci-test
 
-# Fail job if one of the steps fails or continue
-failFast: false
+registry_host: harbor.mellanox.com
+registry_auth: nixl_harbor_credentials
+registry_path: /nixl
 
+failFast: false
 timeout_minutes: 240
 
-# label is defined at jenkins slave configuration, we want to run the job on a gpu agent and be able to esaly replace it without having to change this file
 runs_on_agents:
   - {nodeLabel: 'H100'}
 # - {nodeLabel: 'DGX'}
 
+runs_on_dockers:
+  - {
+      file: "contrib/Dockerfile",
+      name: "ubuntu24.04-nixl-base",
+      uri: "$arch/$name",
+      tag: "20251103",
+      build_args: "--target nixl-base --build-arg ARCH=$arch",
+      nodeLabel: "H100"
+    }
+
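For reference, the registry fields and the `runs_on_dockers` entry presumably compose into a single image reference; a minimal sketch, assuming ci-demo joins `registry_host`, `registry_path`, the `uri` template, and `tag` in that order:

```bash
# Hypothetical resolution of the image reference for the x86_64 axis
arch=x86_64
name=ubuntu24.04-nixl-base
echo "harbor.mellanox.com/nixl/${arch}/${name}:20251103"
# -> harbor.mellanox.com/nixl/x86_64/ubuntu24.04-nixl-base:20251103
```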
 matrix:
   axes:
-    image:
-      - nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04
     arch:
       - x86_64
     ucx_version:
       - master
       - v1.19.0
 
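With the `image` axis removed (the container now comes from `runs_on_dockers`), the matrix expands over `arch` and `ucx_version` only, i.e. two task combinations:

```bash
# Sketch of the matrix expansion defined above
for arch in x86_64; do
  for ucx_version in master v1.19.0; do
    echo "task: ${arch}/ucx-${ucx_version}"
  done
done
# -> x86_64/ucx-master
# -> x86_64/ucx-v1.19.0
```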
-taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}"
-
 env:
-  CONTAINER_WORKSPACE: /workspace
-  INSTALL_DIR: ${CONTAINER_WORKSPACE}/nixl_install
-  # Manual timeout - ci-demo doesn't handle docker exec
+  NIXL_BASE_IMAGE_ENV: "true"
+  NIXL_INSTALL_DIR: /opt/nixl
   TEST_TIMEOUT: 30
-  # NPROC for bare-metal: containers see all host CPUs, need to limit parallelism
+  UCX_TLS: "^shm"
   NPROC: 16
 
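Note `UCX_TLS: "^shm"`: in UCX a leading `^` negates the list, so this enables every transport except shared memory. A quick check on a host with UCX installed (assumes `ucx_info` is on `PATH`):

```bash
# List the transports UCX would actually consider under this setting
UCX_TLS="^shm" ucx_info -d | grep -i transport | sort -u
```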
+pipeline_start:
+  run: |
+    # Setup user environment (replaces Dockerfile.gpu_test)
+    if ! getent group "30" > /dev/null 2>&1; then
+      sudo groupadd -g 30 hardware
+    fi
+    sudo useradd -u 148069 -g 30 -m -s /bin/bash svc-nixl || true
+    echo "svc-nixl ALL=(ALL) NOPASSWD: ALL" | sudo tee /etc/sudoers.d/svc-nixl
+    sudo chmod 440 /etc/sudoers.d/svc-nixl
+    sudo mkdir -p /workspace && sudo chmod 777 /workspace
+
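A hypothetical sanity check (not part of the pipeline) to confirm the `pipeline_start` setup took effect:

```bash
getent group 30                # the gid-30 group (created as "hardware" if it was missing)
id svc-nixl                    # expect uid=148069, gid=30
sudo -u svc-nixl sudo -n true  # exercises the NOPASSWD sudoers entry
test -w /workspace && echo "/workspace is writable"
```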
 steps:
   - name: Get Environment Info
     parallel: false
     run: |
       set +ex
-      # print kernel version
       uname -r
-      # print ofed info
-      ofed_info -s
-      # print nvidia drivers info
-      lsmod | grep nvidia_peermem
-      lsmod | grep gdrdrv
-      lsmod | grep nvidia_fs
-      # print nvidia-smi
-      nvidia-smi
-      nvidia-smi topo -m
-      # print MPS info
-      pgrep -a mps
-      # print compute mode
-      nvidia-smi -q | grep -i "compute mode"
-      # check rdma status
-      ibv_devinfo
-      #ib_write_bw
-
-
-  - name: Build GPU Test Environment
-    parallel: false
-    run: |
-      docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} --build-arg WORKSPACE=${CONTAINER_WORKSPACE} .
-    onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-
-  - name: Run GPU Test Environment
-    parallel: false
-    run: |
-      docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
-        --ulimit memlock=-1:-1 \
-        --network=host \
-        --ipc=host \
-        --cap-add=SYS_PTRACE \
-        --gpus all \
-        --device=/dev/infiniband \
-        --device=/dev/gdrdrv \
-        "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-    onfail: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      ofed_info -s || true
+      lsmod | grep nvidia_peermem || true
+      lsmod | grep gdrdrv || true
+      lsmod | grep nvidia_fs || true
+      nvidia-smi || true
+      nvidia-smi topo -m || true
+      pgrep -a mps || true
+      nvidia-smi -q | grep -i "compute mode" || true
+      ibv_devinfo || true
 
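Every probe above now ends in `|| true`, so a missing tool or unloaded module is reported without failing the step:

```bash
# Non-fatal probe pattern: print a match if present, exit 0 either way
lsmod | grep nvidia_peermem || true
```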
   - name: Build
     parallel: false
     run: |
-      set -ex
-      docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c "UCX_VERSION=${ucx_version} .gitlab/build.sh ${INSTALL_DIR}"
-
-    onfail: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      UCX_VERSION=${ucx_version} .gitlab/build.sh ${NIXL_INSTALL_DIR}
 
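Since the job itself now runs inside the `runs_on_dockers` container, the build is a plain script call; reproducing one matrix combination by hand would look roughly like this (values taken from the axes and `env` above):

```bash
# Hypothetical local reproduction of the Build step for the v1.19.0 axis
export UCX_VERSION=v1.19.0
.gitlab/build.sh /opt/nixl   # /opt/nixl == ${NIXL_INSTALL_DIR}
```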
   - name: Test CPP
     parallel: false
+    timeout: "${TEST_TIMEOUT}"
     run: |
-      timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
-    onfail: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      .gitlab/test_cpp.sh ${NIXL_INSTALL_DIR}
 
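The manual guard is replaced by a declarative step-level `timeout:`; assuming ci-demo interprets the value in minutes (matching `TEST_TIMEOUT: 30`), the removed line was roughly equivalent to:

```bash
# What the old run line did, minus the docker exec indirection
timeout "${TEST_TIMEOUT}m" .gitlab/test_cpp.sh "${NIXL_INSTALL_DIR}"
```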
   - name: Test Python
     parallel: false
+    timeout: "${TEST_TIMEOUT}"
     run: |
-      timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
-    onfail: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      .gitlab/test_python.sh ${NIXL_INSTALL_DIR}
 
   - name: Test Nixlbench
     parallel: false
+    timeout: "${TEST_TIMEOUT}"
     run: |
-      timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_nixlbench.sh ${INSTALL_DIR}"
-    onfail: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      .gitlab/test_nixlbench.sh ${NIXL_INSTALL_DIR}
 
   - name: Test Rust
     parallel: false
+    timeout: "${TEST_TIMEOUT}"
     run: |
-      timeout ${TEST_TIMEOUT}m docker exec -w ${CONTAINER_WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_rust.sh ${INSTALL_DIR}"
-    always: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      .gitlab/test_rust.sh ${NIXL_INSTALL_DIR}
 
 # once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step
 # https://github.com/Mellanox/ci-demo/pull/111