Skip to content

Commit b2701d6

Browse files
authored
Improve build reliability (#725)
Separates the workflows for building the RAPIDS end-user images and the cuVS images. The cuVS images do not depend on the RAPIDS end-user images, so they can be built in parallel. This also allows for finer-grained retries in case of failures. Also switches to using `rapids-mamba-retry` for installing conda packages. Finally, disables building the `cuvs-bench-datasets` images, which are consistently failing (#724), until a better solution than the workaround in #723 is ready. Commit 5adab54 can be reverted to re-enable this.
Authors:
- Ray Douglass (https://github.com/raydouglass)
Approvers:
- Jake Awe (https://github.com/AyodeAwe)
URL: #725
1 parent 4f0424d commit b2701d6

File tree

10 files changed

+318
-159
lines changed

10 files changed

+318
-159
lines changed
Lines changed: 14 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,6 @@ on:
2424
RAPIDS_VER:
2525
required: true
2626
type: string
27-
BASE_TAG:
28-
required: true
29-
type: string
30-
NOTEBOOKS_TAG:
31-
required: true
32-
type: string
3327
CUVS_BENCH_TAG:
3428
required: true
3529
type: string
@@ -88,38 +82,6 @@ jobs:
8882
with:
8983
driver: docker
9084
endpoint: builders
91-
- name: Build base image
92-
uses: docker/build-push-action@v6
93-
with:
94-
context: context
95-
file: Dockerfile
96-
target: base
97-
push: true
98-
pull: true
99-
build-args: |
100-
CUDA_VER=${{ inputs.CUDA_VER }}
101-
LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
102-
LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
103-
LINUX_VER=${{ inputs.LINUX_VER }}
104-
PYTHON_VER=${{ inputs.PYTHON_VER }}
105-
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
106-
tags: ${{ inputs.BASE_TAG }}-${{ matrix.ARCH }}
107-
- name: Build notebooks image
108-
uses: docker/build-push-action@v6
109-
with:
110-
context: context
111-
file: Dockerfile
112-
target: notebooks
113-
push: true
114-
pull: true
115-
build-args: |
116-
CUDA_VER=${{ inputs.CUDA_VER }}
117-
LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
118-
LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
119-
LINUX_VER=${{ inputs.LINUX_VER }}
120-
PYTHON_VER=${{ inputs.PYTHON_VER }}
121-
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
122-
tags: ${{ inputs.NOTEBOOKS_TAG }}-${{ matrix.ARCH }}
12385
- name: Build cuVS Benchmarks GPU image
12486
uses: docker/build-push-action@v6
12587
with:
@@ -134,20 +96,20 @@ jobs:
13496
PYTHON_VER=${{ inputs.PYTHON_VER }}
13597
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
13698
tags: ${{ inputs.CUVS_BENCH_TAG }}-${{ matrix.ARCH }}
137-
- name: Build cuVS Benchmarks GPU with datasets image
138-
uses: docker/build-push-action@v6
139-
with:
140-
context: context
141-
file: cuvs-bench/gpu/Dockerfile
142-
target: cuvs-bench-datasets
143-
push: true
144-
pull: true
145-
build-args: |
146-
CUDA_VER=${{ inputs.CUDA_VER }}
147-
LINUX_VER=${{ inputs.LINUX_VER }}
148-
PYTHON_VER=${{ inputs.PYTHON_VER }}
149-
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
150-
tags: ${{ inputs.CUVS_BENCH_DATASETS_TAG }}-${{ matrix.ARCH }}
99+
# - name: Build cuVS Benchmarks GPU with datasets image
100+
# uses: docker/build-push-action@v6
101+
# with:
102+
# context: context
103+
# file: cuvs-bench/gpu/Dockerfile
104+
# target: cuvs-bench-datasets
105+
# push: true
106+
# pull: true
107+
# build-args: |
108+
# CUDA_VER=${{ inputs.CUDA_VER }}
109+
# LINUX_VER=${{ inputs.LINUX_VER }}
110+
# PYTHON_VER=${{ inputs.PYTHON_VER }}
111+
# RAPIDS_VER=${{ inputs.RAPIDS_VER }}
112+
# tags: ${{ inputs.CUVS_BENCH_DATASETS_TAG }}-${{ matrix.ARCH }}
151113
- name: Build cuVS Benchmarks CPU image
152114
if: inputs.BUILD_CUVS_BENCH_CPU_IMAGE
153115
uses: docker/build-push-action@v6
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Reusable workflow (workflow_call) that builds and pushes the `base` and
# `notebooks` targets of the top-level Dockerfile, once per architecture
# listed in ARCHES. Each image is pushed as "<TAG>-<ARCH>".
name: Build and push image variant

on:
  workflow_call:
    inputs:
      # JSON-encoded list of architectures; expanded with fromJSON() in the
      # job matrix below.
      ARCHES:
        required: true
        type: string
      CUDA_VER:
        required: true
        type: string
      LINUX_DISTRO:
        required: true
        type: string
      LINUX_DISTRO_VER:
        required: true
        type: string
      LINUX_VER:
        required: true
        type: string
      PYTHON_VER:
        required: true
        type: string
      RAPIDS_VER:
        required: true
        type: string
      # Tag for the `base` image; "-<ARCH>" is appended when pushing.
      BASE_TAG:
        required: true
        type: string
      # Tag for the `notebooks` image; "-<ARCH>" is appended when pushing.
      NOTEBOOKS_TAG:
        required: true
        type: string

jobs:
  build:
    strategy:
      matrix:
        ARCH: ${{ fromJSON(inputs.ARCHES) }}
        # Single-element lists: they add no extra jobs. Presumably kept so the
        # versions show up in the matrix job name -- TODO confirm.
        CUDA_VER: ["${{ inputs.CUDA_VER }}"]
        LINUX_VER: ["${{ inputs.LINUX_VER }}"]
        PYTHON_VER: ["${{ inputs.PYTHON_VER }}"]
        RAPIDS_VER: ["${{ inputs.RAPIDS_VER }}"]
      fail-fast: false
    runs-on: "linux-${{ matrix.ARCH }}-cpu4"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history: the condarc step below runs `git describe --tags`.
          fetch-depth: 0
      - name: Install gha-tools
        run: |
          mkdir -p /tmp/gha-tools
          curl -s -L 'https://github.com/rapidsai/gha-tools/releases/latest/download/tools.tar.gz' | tar -xz -C /tmp/gha-tools
          echo "/tmp/gha-tools" >> "${GITHUB_PATH}"
      - name: Clean up condarc for release builds
        run: |
          GIT_DESCRIBE_TAG="$(git describe --tags --abbrev=0)"
          GIT_DESCRIBE_TAG="${GIT_DESCRIBE_TAG:1}" # remove leading 'v'
          # A tag without any lowercase letter is treated as a release tag.
          if [[ ! $GIT_DESCRIBE_TAG =~ [a-z] ]]; then
            # Single quotes keep the backticks literal. Inside double quotes
            # bash would execute `rapidsai`, `rapidsai-nightly` and
            # `dask/label/dev` as command substitutions.
            rapids-logger 'Most recent tag is for release, adding `rapidsai` channel and removing `rapidsai-nightly` and `dask/label/dev` channels.'
            sed -i 's|rapidsai-nightly|rapidsai|;\|dask/label/dev|d' context/condarc
          else
            rapids-logger "Most recent tag is an alpha. Build will use nightly channels."
          fi
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
          password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
      - name: Set up Docker Context for Buildx
        id: buildx-context
        run: |
          docker context create builders
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          driver: docker
          # Uses the `builders` context created in the previous step.
          endpoint: builders
      - name: Build base image
        uses: docker/build-push-action@v6
        with:
          context: context
          file: Dockerfile
          target: base
          push: true
          pull: true
          build-args: |
            CUDA_VER=${{ inputs.CUDA_VER }}
            LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
            LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
            LINUX_VER=${{ inputs.LINUX_VER }}
            PYTHON_VER=${{ inputs.PYTHON_VER }}
            RAPIDS_VER=${{ inputs.RAPIDS_VER }}
          tags: ${{ inputs.BASE_TAG }}-${{ matrix.ARCH }}
      - name: Build notebooks image
        uses: docker/build-push-action@v6
        with:
          context: context
          file: Dockerfile
          target: notebooks
          push: true
          pull: true
          build-args: |
            CUDA_VER=${{ inputs.CUDA_VER }}
            LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
            LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
            LINUX_VER=${{ inputs.LINUX_VER }}
            PYTHON_VER=${{ inputs.PYTHON_VER }}
            RAPIDS_VER=${{ inputs.RAPIDS_VER }}
          tags: ${{ inputs.NOTEBOOKS_TAG }}-${{ matrix.ARCH }}

.github/workflows/build-test-publish-images.yml

Lines changed: 54 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ jobs:
3636
needs:
3737
- checks
3838
- compute-matrix
39-
- build
40-
- build-multiarch-manifest
39+
- build-rapids
40+
- build-rapids-multiarch-manifest
41+
- build-cuvs
42+
- build-cuvs-multiarch-manifest
4143
- test
4244
secrets: inherit
4345
uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
@@ -137,13 +139,13 @@ jobs:
137139
export TEST_MATRIX
138140
139141
echo "TEST_MATRIX=$(yq -n -o json 'env(TEST_MATRIX)' | jq -c '{include: .}')" | tee --append "${GITHUB_OUTPUT}"
140-
build:
142+
build-rapids:
141143
needs: [checks, compute-matrix]
142144
strategy:
143145
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
144146
fail-fast: false
145147
secrets: inherit
146-
uses: ./.github/workflows/build-image.yml
148+
uses: ./.github/workflows/build-rapids-image.yml
147149
with:
148150
ARCHES: ${{ toJSON(matrix.ARCHES) }}
149151
CUDA_VER: ${{ matrix.CUDA_VER }}
@@ -152,7 +154,6 @@ jobs:
152154
LINUX_VER: ${{ matrix.LINUX_VER }}
153155
PYTHON_VER: ${{ matrix.PYTHON_VER }}
154156
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
155-
BUILD_CUVS_BENCH_CPU_IMAGE: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
156157
BASE_TAG:
157158
"rapidsai/${{ needs.compute-matrix.outputs.BASE_IMAGE_REPO }}:\
158159
${{ needs.compute-matrix.outputs.BASE_TAG_PREFIX }}\
@@ -167,6 +168,22 @@ jobs:
167168
${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
168169
cuda${{ matrix.CUDA_TAG }}-\
169170
py${{ matrix.PYTHON_VER }}"
171+
build-cuvs:
172+
needs: [checks, compute-matrix]
173+
strategy:
174+
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
175+
fail-fast: false
176+
secrets: inherit
177+
uses: ./.github/workflows/build-cuvs-image.yml
178+
with:
179+
ARCHES: ${{ toJSON(matrix.ARCHES) }}
180+
CUDA_VER: ${{ matrix.CUDA_VER }}
181+
LINUX_DISTRO: ${{ matrix.LINUX_DISTRO }}
182+
LINUX_DISTRO_VER: ${{ matrix.LINUX_DISTRO_VER }}
183+
LINUX_VER: ${{ matrix.LINUX_VER }}
184+
PYTHON_VER: ${{ matrix.PYTHON_VER }}
185+
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
186+
BUILD_CUVS_BENCH_CPU_IMAGE: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
170187
CUVS_BENCH_TAG:
171188
"rapidsai/${{ needs.compute-matrix.outputs.CUVS_BENCH_IMAGE_REPO }}:\
172189
${{ needs.compute-matrix.outputs.CUVS_BENCH_TAG_PREFIX }}\
@@ -187,8 +204,8 @@ jobs:
187204
${{ needs.compute-matrix.outputs.RAPIDS_VER }}\
188205
${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
189206
py${{ matrix.PYTHON_VER }}"
190-
build-multiarch-manifest:
191-
needs: [build, compute-matrix]
207+
build-rapids-multiarch-manifest:
208+
needs: [build-rapids, compute-matrix]
192209
strategy:
193210
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
194211
fail-fast: false
@@ -206,7 +223,6 @@ jobs:
206223
- name: Create multiarch manifest
207224
shell: bash
208225
env:
209-
CUVS_BENCH_CPU_IMAGE_BUILT: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
210226
BASE_IMAGE_REPO: ${{ needs.compute-matrix.outputs.BASE_IMAGE_REPO }}
211227
BASE_TAG_PREFIX: ${{ needs.compute-matrix.outputs.BASE_TAG_PREFIX }}
212228
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
@@ -215,6 +231,34 @@ jobs:
215231
PYTHON_VER: ${{ matrix.PYTHON_VER }}
216232
NOTEBOOKS_IMAGE_REPO: ${{ needs.compute-matrix.outputs.NOTEBOOKS_IMAGE_REPO }}
217233
NOTEBOOKS_TAG_PREFIX: ${{ needs.compute-matrix.outputs.NOTEBOOKS_TAG_PREFIX }}
234+
GPUCIBOT_DOCKERHUB_USER: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
235+
GPUCIBOT_DOCKERHUB_TOKEN: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
236+
ARCHES: ${{ toJSON(matrix.ARCHES) }}
237+
run: ci/create-rapids-multiarch-manifest.sh
238+
build-cuvs-multiarch-manifest:
239+
needs: [build-cuvs, compute-matrix]
240+
strategy:
241+
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
242+
fail-fast: false
243+
runs-on: ubuntu-latest
244+
steps:
245+
- name: Checkout
246+
uses: actions/checkout@v4
247+
with:
248+
fetch-depth: 0
249+
- name: Login to DockerHub
250+
uses: docker/login-action@v3
251+
with:
252+
username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
253+
password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
254+
- name: Create multiarch manifest
255+
shell: bash
256+
env:
257+
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
258+
ALPHA_TAG: ${{ needs.compute-matrix.outputs.ALPHA_TAG }}
259+
CUDA_TAG: ${{ matrix.CUDA_TAG }}
260+
PYTHON_VER: ${{ matrix.PYTHON_VER }}
261+
CUVS_BENCH_CPU_IMAGE_BUILT: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
218262
CUVS_BENCH_IMAGE_REPO: ${{ needs.compute-matrix.outputs.CUVS_BENCH_IMAGE_REPO }}
219263
CUVS_BENCH_TAG_PREFIX: ${{ needs.compute-matrix.outputs.CUVS_BENCH_TAG_PREFIX }}
220264
CUVS_BENCH_DATASETS_IMAGE_REPO: ${{ needs.compute-matrix.outputs.CUVS_BENCH_DATASETS_IMAGE_REPO }}
@@ -224,9 +268,9 @@ jobs:
224268
GPUCIBOT_DOCKERHUB_USER: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
225269
GPUCIBOT_DOCKERHUB_TOKEN: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
226270
ARCHES: ${{ toJSON(matrix.ARCHES) }}
227-
run: ci/create-multiarch-manifest.sh
271+
run: ci/create-cuvs-multiarch-manifest.sh
228272
test:
229-
needs: [compute-matrix, build]
273+
needs: [compute-matrix, build-rapids]
230274
if: inputs.run_tests
231275
strategy:
232276
matrix: ${{ fromJSON(needs.compute-matrix.outputs.TEST_MATRIX) }}

Dockerfile

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ ARG RAPIDS_VER
4242

4343
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
4444

45+
RUN <<EOF
46+
apt-get update
47+
apt-get install -y wget
48+
wget https://github.com/rapidsai/gha-tools/releases/latest/download/tools.tar.gz -O - | tar -xz -C /usr/local/bin
49+
apt-get purge -y --auto-remove wget
50+
rm -rf /var/lib/apt/lists/*
51+
EOF
4552
RUN useradd -rm -d /home/rapids -s /bin/bash -g conda -u 1001 rapids
4653

4754
USER rapids
@@ -57,7 +64,7 @@ conda config --show-sources
5764
conda list --show-channel-urls
5865

5966
# Install RAPIDS
60-
mamba install -y -n base \
67+
rapids-mamba-retry install -y -n base \
6168
"rapids=${RAPIDS_VER}.*" \
6269
"python=${PYTHON_VER}.*" \
6370
"cuda-version=${CUDA_VER%.*}.*" \
@@ -90,12 +97,12 @@ COPY --from=dependencies --chown=rapids /test_notebooks_dependencies.yaml test_n
9097
COPY --from=dependencies --chown=rapids /notebooks /home/rapids/notebooks
9198

9299
RUN <<EOF
93-
mamba env update -n base -f test_notebooks_dependencies.yaml
100+
rapids-mamba-retry env update -n base -f test_notebooks_dependencies.yaml
94101
conda clean -afy
95102
EOF
96103

97104
RUN <<EOF
98-
mamba install -y -n base \
105+
rapids-mamba-retry install -y -n base \
99106
"jupyterlab=4" \
100107
dask-labextension \
101108
jupyterlab-nvdashboard

ci/common.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/bin/bash
# Shared setup for the CI scripts: logs in to the DockerHub API and exports
# the resulting token as HUB_TOKEN.
#
# Requires GPUCIBOT_DOCKERHUB_USER and GPUCIBOT_DOCKERHUB_TOKEN in the
# environment (`set -u` aborts if either is unset).

set -eEuo pipefail

# Authenticate and retrieve DockerHub token.
# The JSON payload is built with jq so credentials containing quotes or
# backslashes are escaped correctly instead of producing invalid JSON.
# `.token // empty` turns a missing/null token into an empty string so a
# failed login is detected below instead of exporting the literal "null".
HUB_TOKEN=$(
  curl -sS -H "Content-Type: application/json" \
    -X POST \
    -d "$(jq -n \
      --arg username "${GPUCIBOT_DOCKERHUB_USER}" \
      --arg password "${GPUCIBOT_DOCKERHUB_TOKEN}" \
      '{username: $username, password: $password}')" \
    https://hub.docker.com/v2/users/login/ | jq -r '.token // empty'
)
if [[ -z "${HUB_TOKEN}" ]]; then
  echo "Error: DockerHub login did not return a token. Check GPUCIBOT_DOCKERHUB_USER and GPUCIBOT_DOCKERHUB_TOKEN." >&2
  exit 1
fi
# GitHub Actions workflow command: mask the token in subsequent log output.
echo "::add-mask::${HUB_TOKEN}"
export HUB_TOKEN
14+
15+
# Verify that a Docker tag exists on DockerHub; abort the script if it does not.
#
# Arguments:
#   $1 - repository name inside the "${org}" organization
#   $2 - tag to look up
# Globals read: org, HUB_TOKEN
# Exits with status 1, after printing an error to stderr, unless the DockerHub
# API answers the tag lookup with HTTP 200. Returns 0 otherwise.
check_tag_exists() {
  local repo="$1"
  local tag="$2"
  local exists
  # curl still writes the status code ("000") when the transfer itself fails;
  # `|| true` keeps `set -e` from aborting here so the error below is reported.
  exists=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: JWT $HUB_TOKEN" \
    "https://hub.docker.com/v2/repositories/${org}/${repo}/tags/${tag}/") || true

  # String comparison: a non-numeric value would make `[ ... -ne 200 ]` error
  # out, which the `if` would treat as "tag exists".
  if [ "$exists" != "200" ]; then
    echo "Error: Required image tag ${repo}:${tag} does not exist. This implies that the image was not built successfully in the build job." >&2
    exit 1
  fi
}
28+
29+
# DockerHub organization; check_tag_exists reads it when building the
# repository URL for the tag lookup.
export org="rapidsai"

0 commit comments

Comments
 (0)