Skip to content

Commit 385765e

Browse files
authored
Use the sccache-dist build cluster for RAPIDS CI jobs (#7014)
* use the sccache-dist build cluster for RAPIDS CI jobs [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids]
* run devcontainer-utils lifecycle scripts [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids]
* define GH_TOKEN [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids]
* cpu32 -> cpu16 [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids]
* remove preprocessor cache key prefix [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids]
* increase nofile ulimit [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids]
1 parent 7e6770d commit 385765e

File tree

4 files changed

+78
-37
lines changed

4 files changed

+78
-37
lines changed

.devcontainer/cccl-entrypoint.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,21 @@
44

55
set -e;
66

7+
if ! test -n "${DISABLE_SCCACHE:+x}" && test -n "${DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST:+x}" && ! test -n "${SCCACHE_DIST_URL:+x}"; then
8+
export SCCACHE_DIST_URL="https://$(dpkg --print-architecture).$(uname -s | tr '[:upper:]' '[:lower:]').sccache.rapids.nvidia.com";
9+
echo "export SCCACHE_DIST_URL=$SCCACHE_DIST_URL" >> ~/.bashrc;
10+
fi
11+
12+
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
13+
echo "::group::Initializing devcontainer..."
14+
fi
15+
716
devcontainer-utils-post-create-command;
817
devcontainer-utils-init-git;
918
devcontainer-utils-post-attach-command;
19+
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
20+
echo "::endgroup::"
21+
fi
1022

1123
if ! dpkg -s ca-certificates > /dev/null 2>&1; then
1224
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then

.github/workflows/build-rapids.yml

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ jobs:
5858
name: "${{ matrix.libs }}"
5959
if: needs.check-event.outputs.ok == 'true'
6060
needs: check-event
61-
runs-on: ${{ fromJSON(github.repository != 'NVIDIA/cccl' && '"ubuntu-latest"' || '"linux-amd64-cpu32"') }}
61+
runs-on: ${{ fromJSON(github.repository != 'NVIDIA/cccl' && '"ubuntu-latest"' || '"linux-amd64-cpu16"') }}
6262
strategy:
6363
fail-fast: false
6464
matrix:
@@ -88,6 +88,8 @@ jobs:
8888
CCCL_TAG: ${{ inputs.override_cccl_tag }}
8989
CCCL_VERSION: ${{ inputs.override_cccl_version }}
9090
CI: true
91+
CONDA_ENV_CREATE_QUIET: true
92+
GH_TOKEN: ${{ github.token }}
9193
RAPIDS_LIBS: ${{ matrix.libs }}
9294
# Uncomment any of these to customize the git repo and branch for a RAPIDS lib:
9395
# RAPIDS_cmake_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}'
@@ -102,11 +104,21 @@ jobs:
102104
# RAPIDS_raft_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}'
103105
# RAPIDS_rmm_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}'
104106
# RAPIDS_ucxx_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}'
107+
# Build cluster auth
108+
SCCACHE_DIST_TOKEN: "${{ secrets.SCCACHE_DIST_TOKEN }}"
109+
SCCACHE_DIST_AUTH_TOKEN_VAR: "SCCACHE_DIST_TOKEN"
110+
# Retry intermittent failures
111+
SCCACHE_DIST_MAX_RETRIES: "inf"
112+
# Never fallback to building locally, fail instead
113+
SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: "false"
114+
SCCACHE_IDLE_TIMEOUT: 0
105115
run: |
106116
cat <<"EOF" > "$RUNNER_TEMP/ci.sh"
107117
#! /usr/bin/env bash
108118
set -eo pipefail
109119
120+
ulimit -n $(ulimit -Hn)
121+
110122
declare -a failures
111123
112124
_print_err_exit_msg() {
@@ -122,7 +134,7 @@ jobs:
122134
cat <<____EOF
123135
RAPIDS_LIBS='${RAPIDS_LIBS}'$(for lib in cmake ${RAPIDS_LIBS}; do var=RAPIDS_${lib//-/_}_GIT_REPO; if test -v "$var" && test -n "${!var}"; then echo -n " $var='${!var}'"; fi; done) \\
124136
.devcontainer/launch.sh -d -c ${{matrix.cuda}} -H rapids-conda -- ./ci/rapids/rapids-entrypoint.sh \\
125-
/bin/bash -li -c 'uninstall-all -j -qqq && clean-all -j && build-all -j -v || exec /bin/bash -li'
137+
/bin/bash -li -c 'uninstall-all -j -qqq && clean-all -j && build-all -j0 -v || exec /bin/bash -li'
126138
____EOF
127139
echo ""
128140
echo "For additional information, see:"
@@ -165,14 +177,9 @@ jobs:
165177
--env "AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-}" \
166178
--env "AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}" \
167179
--env "CONDA_ENV_CREATE_QUIET=true" \
168-
--env "CCCL_TAG=${CCCL_TAG}" \
169-
--env "CCCL_VERSION=${CCCL_VERSION}" \
170-
--env "DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=true" \
171-
--env "INFER_NUM_DEVICE_ARCHITECTURES=true" \
172-
--env "MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=100" \
173-
--env "SCCACHE_BUCKET=${SCCACHE_BUCKET:-}" \
174-
--env "SCCACHE_REGION=${SCCACHE_REGION:-}" \
175-
--env "SCCACHE_IDLE_TIMEOUT=0" \
180+
--env "CCCL_TAG=$CCCL_TAG" \
181+
--env "CCCL_VERSION=$CCCL_VERSION" \
182+
--env "GH_TOKEN=$GH_TOKEN" \
176183
--env "GITHUB_ACTIONS=$GITHUB_ACTIONS" \
177184
--env "GITHUB_SHA=$GITHUB_SHA" \
178185
--env "GITHUB_REF_NAME=$GITHUB_REF_NAME" \

ci/rapids/cuda13.0-conda/devcontainer.json

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,61 @@
11
{
2-
"image": "rapidsai/devcontainers:25.12-cpp-mambaforge-ubuntu24.04",
2+
"image": "rapidsai/devcontainers:26.02-cpp-mambaforge",
33
"runArgs": [
44
"--init",
55
"--rm",
66
"--name",
7-
"${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-25.12-cuda13.0-conda"
7+
"${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-26.02-cuda13.0-conda",
8+
"--ulimit",
9+
"nofile=500000"
810
],
911
"hostRequirements": {"gpu": "optional"},
10-
"features": {
11-
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {}
12-
},
13-
"overrideFeatureInstallOrder": [
14-
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
15-
],
1612
"containerEnv": {
13+
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
1714
"CI": "${localEnv:CI}",
18-
"CUDAARCHS": "75-real",
1915
"CUDA_VERSION": "13.0",
16+
"CUDAARCHS": "75-real",
2017
"DEFAULT_CONDA_ENV": "rapids",
21-
"PYTHONSAFEPATH": "1",
22-
"PYTHONUNBUFFERED": "1",
23-
"PYTHONDONTWRITEBYTECODE": "1",
24-
"PYTHON_PACKAGE_MANAGER": "conda",
25-
"SCCACHE_REGION": "us-east-2",
26-
"SCCACHE_BUCKET": "rapids-sccache-devs",
27-
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
18+
"DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "true",
2819
"HISTFILE": "/home/coder/.cache/._bash_history",
20+
"INFER_NUM_DEVICE_ARCHITECTURES": "1",
2921
"LIBCUDF_KERNEL_CACHE_PATH": "/home/coder/cudf/cpp/build/latest/jitify_cache",
30-
"RAPIDS_LIBS": "${localEnv:RAPIDS_LIBS}",
22+
"MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL": "20",
23+
"PYTHON_PACKAGE_MANAGER": "conda",
24+
"PYTHONDONTWRITEBYTECODE": "1",
25+
"PYTHONSAFEPATH": "1",
26+
"PYTHONUNBUFFERED": "1",
3127
"RAPIDS_cmake_GIT_REPO": "${localEnv:RAPIDS_cmake_GIT_REPO}",
32-
"RAPIDS_rmm_GIT_REPO": "${localEnv:RAPIDS_rmm_GIT_REPO}",
33-
"RAPIDS_ucxx_GIT_REPO": "${localEnv:RAPIDS_ucxx_GIT_REPO}",
34-
"RAPIDS_kvikio_GIT_REPO": "${localEnv:RAPIDS_kvikio_GIT_REPO}",
3528
"RAPIDS_cudf_GIT_REPO": "${localEnv:RAPIDS_cudf_GIT_REPO}",
36-
"RAPIDS_raft_GIT_REPO": "${localEnv:RAPIDS_raft_GIT_REPO}",
37-
"RAPIDS_cuvs_GIT_REPO": "${localEnv:RAPIDS_cuvs_GIT_REPO}",
38-
"RAPIDS_cumlprims_mg_GIT_REPO": "${localEnv:RAPIDS_cumlprims_mg_GIT_REPO}",
39-
"RAPIDS_cuml_GIT_REPO": "${localEnv:RAPIDS_cuml_GIT_REPO}",
4029
"RAPIDS_cugraph_GIT_REPO": "${localEnv:RAPIDS_cugraph_GIT_REPO}",
41-
"RAPIDS_cugraph_gnn_GIT_REPO": "${localEnv:RAPIDS_cugraph_gnn_GIT_REPO}"
30+
"RAPIDS_cugraph_gnn_GIT_REPO": "${localEnv:RAPIDS_cugraph_gnn_GIT_REPO}",
31+
"RAPIDS_cuml_GIT_REPO": "${localEnv:RAPIDS_cuml_GIT_REPO}",
32+
"RAPIDS_cumlprims_mg_GIT_REPO": "${localEnv:RAPIDS_cumlprims_mg_GIT_REPO}",
33+
"RAPIDS_cuvs_GIT_REPO": "${localEnv:RAPIDS_cuvs_GIT_REPO}",
34+
"RAPIDS_kvikio_GIT_REPO": "${localEnv:RAPIDS_kvikio_GIT_REPO}",
35+
"RAPIDS_LIBS": "${localEnv:RAPIDS_LIBS}",
36+
"RAPIDS_raft_GIT_REPO": "${localEnv:RAPIDS_raft_GIT_REPO}",
37+
"RAPIDS_rmm_GIT_REPO": "${localEnv:RAPIDS_rmm_GIT_REPO}",
38+
"RAPIDS_ucxx_GIT_REPO": "${localEnv:RAPIDS_ucxx_GIT_REPO}",
39+
"SCCACHE_BUCKET": "rapids-sccache-devs",
40+
"SCCACHE_DIST_AUTH_TOKEN_VAR": "${localEnv:SCCACHE_DIST_AUTH_TOKEN_VAR}",
41+
"SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE": "${localEnv:SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE:true}",
42+
"SCCACHE_DIST_MAX_RETRIES": "${localEnv:SCCACHE_DIST_MAX_RETRIES:4}",
43+
"SCCACHE_DIST_REQUEST_TIMEOUT": "${localEnv:SCCACHE_DIST_REQUEST_TIMEOUT:7140}",
44+
"SCCACHE_DIST_TOKEN": "${localEnv:SCCACHE_DIST_TOKEN}",
45+
"SCCACHE_IDLE_TIMEOUT": "${localEnv:SCCACHE_IDLE_TIMEOUT:0}",
46+
"SCCACHE_REGION": "us-east-2",
47+
"SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true",
48+
"SCCACHE_SERVER_LOG": "${localEnv:SCCACHE_SERVER_LOG:sccache=debug}"
4249
},
4350
"initializeCommand": [
4451
"/bin/bash",
4552
"-c",
46-
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config} ${localWorkspaceFolder}/ci/rapids/.{conda,log/devcontainer-utils} ${localWorkspaceFolder}/ci/rapids/.repos/{rmm,kvikio,ucxx,cudf,raft,cuvs,cumlprims_mg,cuml,cugraph,cugraph-gnn}"
53+
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config,local/state} ${localWorkspaceFolder}/ci/rapids/.{conda,log/devcontainer-utils} ${localWorkspaceFolder}/ci/rapids/.repos/{rmm,kvikio,ucxx,cudf,raft,cuvs,cumlprims_mg,cuml,cugraph,cugraph-gnn}"
4754
],
4855
"postCreateCommand": [
4956
"/bin/bash",
5057
"-c",
51-
"if [ ${CI:-false} = 'false' ]; then . /home/coder/cccl/ci/rapids/post-create-command.sh; fi"
58+
"if [ ${CI:-false} = 'false' ]; then . /home/coder/cccl/ci/rapids/post-create-command.sh; fi; if test -z \"${DISABLE_SCCACHE:+x}\"; then echo \"export SCCACHE_DIST_URL='https://$(dpkg --print-architecture).$(uname -s | tr '[:upper:]' '[:lower:]').sccache.rapids.nvidia.com'\" >> /home/coder/.bashrc; fi"
5259
],
5360
"postAttachCommand": [
5461
"/bin/bash",
@@ -61,6 +68,7 @@
6168
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
6269
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
6370
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
71+
"source=${localWorkspaceFolder}/.local/state,target=/home/coder/.local/state,type=bind,consistency=consistent",
6472
"source=${localWorkspaceFolder}/ci/rapids/.repos/rmm,target=/home/coder/rmm,type=bind,consistency=consistent",
6573
"source=${localWorkspaceFolder}/ci/rapids/.repos/kvikio,target=/home/coder/kvikio,type=bind,consistency=consistent",
6674
"source=${localWorkspaceFolder}/ci/rapids/.repos/ucxx,target=/home/coder/ucxx,type=bind,consistency=consistent",
@@ -74,6 +82,12 @@
7482
"source=${localWorkspaceFolder}/ci/rapids/.conda,target=/home/coder/.conda,type=bind,consistency=consistent",
7583
"source=${localWorkspaceFolder}/ci/rapids/.log/devcontainer-utils,target=/var/log/devcontainer-utils,type=bind,consistency=consistent"
7684
],
85+
"features": {
86+
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {}
87+
},
88+
"overrideFeatureInstallOrder": [
89+
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
90+
],
7791
"customizations": {
7892
"vscode": {
7993
"extensions": [

ci/rapids/rapids-entrypoint.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,17 @@
44

55
set -e;
66

7+
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
8+
echo "::group::Cloning RAPIDS..."
9+
fi
10+
711
ci/rapids/post-create-command.sh;
812
rapids-post-start-command -f;
913

14+
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
15+
echo "::endgroup::"
16+
fi
17+
1018
if test $# -gt 0; then
1119
exec "$@";
1220
else

0 commit comments

Comments
 (0)