Skip to content

Commit fcd3d05

Browse files
committed
test new path for cache and build-jax
1 parent d396401 commit fcd3d05

File tree

4 files changed

+209
-9
lines changed

4 files changed

+209
-9
lines changed

.github/actions/build-container/action.yml

Lines changed: 147 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,17 @@ inputs:
5151
required: true
5252
default: ""
5353
bazel-remote-cache-url:
54-
description: "URL of the Bazel remote cache to use for building the image"
55-
required: true
54+
description: "URL of the Bazel remote cache to use for building the image (http/grpc). Leave empty to use Dockerfile-default cache mount paths."
55+
required: false
5656
default: ""
57+
CACHE_REGISTRY:
58+
description: "OCI registry used for BuildKit layer cache (cache-to/cache-from). Must be writable by the GITHUB_TOKEN."
59+
required: false
60+
default: "ghcr.io/nvidia/jax-toolbox-buildcache"
61+
ENABLE_BAZEL_REPO_CACHE:
62+
description: "Enable Bazel repository-cache save/restore via actions/cache and build-context injection. Set to 'true' only for containers that run Bazel (i.e. build-jax)."
63+
required: false
64+
default: "false"
5765

5866
outputs:
5967
DOCKER_TAG_MEALKIT:
@@ -73,6 +81,60 @@ runs:
7381
echo 'UPLD_IMAGE=ghcr.io/nvidia/jax-toolbox-internal' >> $GITHUB_ENV
7482
echo "BADGE_FILENAME_FULL=${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json" >> $GITHUB_ENV
7583
84+
- name: Set up build cache environment
85+
id: cache-env
86+
shell: bash
87+
run: |
88+
# Sanitize branch name: replace / and non-tag chars, cap at 100 chars. The ref is read from the runner-provided GITHUB_REF_NAME variable instead of an inline workflow expression, so a crafted branch name cannot inject shell code into this script.
89+
BRANCH=$(printf '%s\n' "${GITHUB_REF_NAME}" \
90+
| sed 's|/|-|g' \
91+
| tr -cs 'a-zA-Z0-9._-' '-' \
92+
| sed 's/^-//;s/-$//' \
93+
| cut -c1-100)
94+
95+
CONTAINER="${{ inputs.CONTAINER_NAME }}"
96+
ARCH="${{ inputs.ARCHITECTURE }}"
97+
REGISTRY="${{ inputs.CACHE_REGISTRY }}"
98+
99+
echo "CACHE_REF_MEALKIT_BRANCH=${REGISTRY}:${CONTAINER}-${ARCH}-${BRANCH}-mealkit" >> $GITHUB_ENV
100+
echo "CACHE_REF_MEALKIT_MAIN=${REGISTRY}:${CONTAINER}-${ARCH}-main-mealkit" >> $GITHUB_ENV
101+
echo "CACHE_REF_FINAL_BRANCH=${REGISTRY}:${CONTAINER}-${ARCH}-${BRANCH}-final" >> $GITHUB_ENV
102+
echo "CACHE_REF_FINAL_MAIN=${REGISTRY}:${CONTAINER}-${ARCH}-main-final" >> $GITHUB_ENV
103+
104+
# Bazel repo-cache key: keyed on arch only so that restore-keys
105+
# always finds the most recent cache for this architecture, regardless of
106+
# which JAX/XLA commit is being built. EXTRA_BUILD_ARGS contains
107+
# URLREF_JAX=...#<commit> which changes every upstream push; including
108+
# it would cause a primary-key miss on every single run.
109+
echo "BAZEL_REPO_CACHE_KEY=bazel-repo-${ARCH}" >> $GITHUB_ENV
110+
111+
# Pass BAZEL_CACHE build-arg only when a remote URL is explicitly given;
112+
# otherwise the Dockerfile default (/cache/bazel-disk) applies.
113+
if [[ -n "${{ inputs.bazel-remote-cache-url }}" ]]; then
114+
echo "BAZEL_CACHE_BUILD_ARG=BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}" >> $GITHUB_ENV
115+
else
116+
echo "BAZEL_CACHE_BUILD_ARG=" >> $GITHUB_ENV
117+
fi
118+
119+
# Ensure the directory exists regardless of whether we restore a cache.
120+
# An empty directory fed via --build-context is equivalent to FROM scratch.
121+
- name: Prepare Bazel repo cache directory
122+
shell: bash
123+
run: mkdir -p /tmp/bazel-repo-cache
124+
125+
- name: Restore Bazel repo cache
126+
id: restore-bazel-repo-cache
127+
if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
128+
uses: actions/cache/restore@v4
129+
with:
130+
path: /tmp/bazel-repo-cache
131+
# Key: arch-scoped + run_id so each successful run saves a fresh entry.
132+
# The primary key is unique per run, so restore normally falls through to
133+
# restore-keys, which selects the most recent cache saved for this arch.
134+
key: ${{ env.BAZEL_REPO_CACHE_KEY }}-${{ github.run_id }}
135+
restore-keys: |
136+
${{ env.BAZEL_REPO_CACHE_KEY }}-
137+
76138
- name: Setup SSH
77139
id: setup-ssh
78140
uses: ./.github/actions/setup-ssh
@@ -136,9 +198,23 @@ runs:
136198
"SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}"
137199
build-args: |
138200
BASE_IMAGE=${{ inputs.BASE_IMAGE }}
139-
BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}
140201
BUILD_DATE=${{ inputs.BUILD_DATE }}
202+
${{ env.BAZEL_CACHE_BUILD_ARG }}
141203
${{ inputs.EXTRA_BUILD_ARGS }}
204+
# Inject pre-restored Bazel repo cache as a named build context.
205+
# The Dockerfile declares `FROM scratch AS bazel-repo-seed`; passing this
206+
# context overrides that stage with the contents of the local directory.
207+
# When the directory is empty (cold start) the bind-mount is a no-op.
208+
build-contexts: |
209+
bazel-repo-seed=/tmp/bazel-repo-cache
210+
cache-from: |
211+
type=registry,ref=${{ env.CACHE_REF_MEALKIT_BRANCH }}
212+
type=registry,ref=${{ env.CACHE_REF_MEALKIT_MAIN }}
213+
# mode=max: cache ALL intermediate stages (builder, mealkit) so
214+
# the full Bazel layer is reusable on the next run.
215+
# ignore-error=true: cache push failure must never kill the build.
216+
cache-to: type=registry,ref=${{ env.CACHE_REF_MEALKIT_BRANCH }},mode=max,oci-mediatypes=true,image-manifest=true,compression=zstd,ignore-error=true
217+
142218
# FINAL IMAGE BUILD
143219
- name: Set docker metadata - final
144220
id: final-metadata
@@ -169,9 +245,76 @@ runs:
169245
"SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}"
170246
build-args: |
171247
BASE_IMAGE=${{ inputs.BASE_IMAGE }}
172-
BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}
173248
BUILD_DATE=${{ inputs.BUILD_DATE }}
249+
${{ env.BAZEL_CACHE_BUILD_ARG }}
174250
${{ inputs.EXTRA_BUILD_ARGS }}
251+
build-contexts: |
252+
bazel-repo-seed=/tmp/bazel-repo-cache
253+
cache-from: |
254+
type=registry,ref=${{ env.CACHE_REF_FINAL_BRANCH }}
255+
type=registry,ref=${{ env.CACHE_REF_FINAL_MAIN }}
256+
type=registry,ref=${{ env.CACHE_REF_MEALKIT_BRANCH }}
257+
type=registry,ref=${{ env.CACHE_REF_MEALKIT_MAIN }}
258+
# mode=min: builder+mealkit layers are already cached under CACHE_REF_MEALKIT_BRANCH
259+
# (mode=max above). Only cache the final stage's unique layers here.
260+
cache-to: type=registry,ref=${{ env.CACHE_REF_FINAL_BRANCH }},mode=min,oci-mediatypes=true,image-manifest=true,compression=zstd,ignore-error=true
261+
262+
# BAZEL REPO CACHE EXPORT
263+
# This step extracts /cache/bazel-repo from the builder stage back to the
264+
# runner so it can be saved via actions/cache.
265+
#
266+
# It builds only the lightweight `bazel-repo-export` stage (FROM scratch +
267+
# COPY --from=builder), pulling the already-cached builder layer from the
268+
# registry. No Bazel re-invocation occurs.
269+
#
270+
# Cost: one registry pull of the builder layer + a local copy.
271+
# This is bounded (repo cache only, NOT disk cache) and will NOT trigger
272+
# the multi-hour `[cache-export] exporting to client directory` stall that
273+
# afflicted the old type=local full-cache-export path.
274+
- name: Export Bazel repo cache
275+
id: export-bazel-repo-cache
276+
if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true' && steps.mealkit-build.outcome == 'success'
277+
shell: bash
278+
env:
279+
EXTRA_BUILD_ARGS: ${{ inputs.EXTRA_BUILD_ARGS }}
280+
run: |
281+
# Convert newline-separated build args into individual --build-arg flags. BAZEL_CACHE_BUILD_ARG (written to GITHUB_ENV by the cache-env step; may be empty) is prepended so the builder stage receives the same build args as the mealkit build; otherwise its cached layer would not match and Bazel would run again. Empty lines are skipped.
282+
BUILD_ARG_FLAGS=()
283+
while IFS= read -r line; do
284+
[[ -z "$line" ]] && continue
285+
BUILD_ARG_FLAGS+=(--build-arg "$line")
286+
done <<< "${BAZEL_CACHE_BUILD_ARG:-}"$'\n'"${EXTRA_BUILD_ARGS}"
287+
288+
# Export to a staging dir to avoid a read/write collision:
289+
# the build reads /tmp/bazel-repo-cache via --build-context and
290+
# would corrupt the seed if we wrote to the same path concurrently.
291+
rm -rf /tmp/bazel-repo-cache-new
292+
293+
docker buildx build \
294+
--platform "linux/${{ inputs.ARCHITECTURE }}" \
295+
--file "${{ inputs.DOCKERFILE }}" \
296+
--target bazel-repo-export \
297+
--output "type=local,dest=/tmp/bazel-repo-cache-new" \
298+
--cache-from "type=registry,ref=${{ env.CACHE_REF_MEALKIT_BRANCH }}" \
299+
--cache-from "type=registry,ref=${{ env.CACHE_REF_MEALKIT_MAIN }}" \
300+
--build-arg "BASE_IMAGE=${{ inputs.BASE_IMAGE }}" \
301+
--build-arg "BUILD_DATE=${{ inputs.BUILD_DATE }}" \
302+
--build-context "bazel-repo-seed=/tmp/bazel-repo-cache" \
303+
"${BUILD_ARG_FLAGS[@]}" \
304+
"${{ inputs.DOCKER_CONTEXT }}"
305+
306+
# Swap staging into the canonical location for actions/cache/save
307+
rm -rf /tmp/bazel-repo-cache
308+
mv /tmp/bazel-repo-cache-new /tmp/bazel-repo-cache
309+
310+
- name: Save Bazel repo cache
311+
if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true' && steps.export-bazel-repo-cache.outcome == 'success'
312+
uses: actions/cache/save@v4
313+
with:
314+
path: /tmp/bazel-repo-cache
315+
# Same key as restore: arch + run_id. Cache entries are immutable, so each
316+
# successful run saves a new entry; older entries are removed by GitHub's cache eviction policy.
317+
key: ${{ env.BAZEL_REPO_CACHE_KEY }}-${{ github.run_id }}
175318

176319
# SITREP GENERATION
177320
- name: Generate sitrep

.github/container/Dockerfile.jax

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,34 @@ ARG SRC_PATH_TRANSFORMER_ENGINE=/opt/transformer-engine
1919
ARG GIT_USER_NAME="JAX Toolbox"
2020
ARG GIT_USER_EMAIL=jax@nvidia.com
2121

22-
ARG BAZEL_CACHE=/tmp
22+
ARG BAZEL_CACHE=/cache/bazel-disk
23+
ARG BAZEL_REPO_CACHE=/cache/bazel-repo
2324
ARG BUILD_DATE
2425

26+
###############################################################################
27+
## Bazel repo-cache seed stage
28+
## Overridden at build time via:
29+
## --build-context bazel-repo-seed=<dir>
30+
## When not overridden (FROM scratch), the bind-mount below is a no-op and
31+
## Bazel starts with an empty repository cache.
32+
###############################################################################
33+
34+
FROM scratch AS bazel-repo-seed
35+
2536
###############################################################################
2637
## Build JAX
2738
###############################################################################
2839

2940
FROM ${BASE_IMAGE} AS builder
41+
ARG TARGETARCH
3042
ARG URLREF_JAX
3143
ARG URLREF_TRANSFORMER_ENGINE
3244
ARG URLREF_XLA
3345
ARG SRC_PATH_JAX
3446
ARG SRC_PATH_TRANSFORMER_ENGINE
3547
ARG SRC_PATH_XLA
3648
ARG BAZEL_CACHE
49+
ARG BAZEL_REPO_CACHE
3750
ARG BUILD_PATH_JAXLIB
3851
ARG EXTRA_BAZEL_TARGETS
3952
ARG EXTRA_BUILD_JAX_ARGS
@@ -54,9 +67,21 @@ RUN ARCH="$(dpkg --print-architecture)" && \
5467
chmod +x /usr/local/bin/bazel
5568
# Populate ${BUILD_PATH_JAXLIB} with editable wheels; --no-install because
5669
# (a) this is the builder stage, and (b) pip-finalize.sh does the install
57-
RUN mkdir -p /builder/extra-targets/{bin,python} && \
70+
#
71+
# Cache mounts:
72+
# bazel-disk – arch-scoped BuildKit cache for Bazel --disk_cache; ephemeral
73+
# (not exported), avoids the multi-hour cache-export stall.
74+
# bazel-repo-seed bind-mount – read-only snapshot restored from actions/cache
75+
# via --build-context bazel-repo-seed=<dir> on the runner.
76+
# Falls back to empty scratch stage when no context is provided.
77+
RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,sharing=locked \
78+
--mount=type=bind,from=bazel-repo-seed,source=.,target=/cache/bazel-repo-seed,readonly \
79+
mkdir -p /cache/bazel-repo /builder/extra-targets/{bin,python} && \
80+
(cp -a /cache/bazel-repo-seed/. /cache/bazel-repo/ \
81+
|| echo "WARNING: bazel-repo-seed copy failed; proceeding with empty repo cache" >&2) && \
5882
build-jax.sh \
5983
--bazel-cache ${BAZEL_CACHE} \
84+
--bazel-repo-cache ${BAZEL_REPO_CACHE} \
6085
--build-path-jaxlib ${BUILD_PATH_JAXLIB} \
6186
--extra-targets "${EXTRA_BAZEL_TARGETS}" \
6287
--extra-target-dest /builder/extra-targets \
@@ -148,3 +173,17 @@ RUN install-nsys-jax.sh ${SRC_PATH_NSYS_JAX}
148173

149174
FROM mealkit AS final
150175
RUN pip-finalize.sh
176+
177+
###############################################################################
178+
## Bazel repo-cache export stage
179+
## Used exclusively by the CI action to extract /cache/bazel-repo from the
180+
## builder layer back to the runner filesystem, without a full image rebuild.
181+
##
182+
## Usage in action:
183+
## docker buildx build --target bazel-repo-export \
184+
## --output type=local,dest=/tmp/bazel-repo-cache-new \
185+
## --cache-from type=registry,ref=<mealkit-cache-ref> ...
186+
###############################################################################
187+
188+
FROM scratch AS bazel-repo-export
189+
COPY --from=builder /cache/bazel-repo /

.github/container/build-jax.sh

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@ usage() {
3737
echo " Usage: $0 [OPTIONS]"
3838
echo ""
3939
echo " OPTIONS DESCRIPTION"
40-
echo " --bazel-cache URI Path for local bazel cache or URL of remote bazel cache"
40+
echo " --bazel-cache URI Path for local bazel disk cache or URL of remote bazel cache"
4141
echo " --bazel-cache-namespace NAME Namespace for bazel cache content"
42+
echo " --bazel-repo-cache PATH Path for local bazel repository cache (--repository_cache)"
4243
echo " --build-param PARAM Param passed to the jaxlib build command. Can be passed many times."
4344
echo " --build-path-jaxlib PATH Editable install prefix for jaxlib and plugins"
4445
echo " --clean Delete local configuration and bazel cache"
@@ -63,6 +64,7 @@ usage() {
6364
# Set defaults
6465
BAZEL_CACHE=""
6566
BAZEL_CACHE_NAMESPACE="jax${CUDA_BASE_IMAGE:+:}${CUDA_BASE_IMAGE}"
67+
BAZEL_REPO_CACHE=""
6668
BUILD_PATH_JAXLIB="/opt/jaxlibs"
6769
BUILD_PARAM=""
6870
CLEAN=0
@@ -76,7 +78,7 @@ IS_RELEASE=0
7678
SRC_PATH_JAX="/opt/jax"
7779
SRC_PATH_XLA="/opt/xla"
7880

79-
args=$(getopt -o h,r --long bazel-cache:,bazel-cache-namespace:,build-param:,build-path-jaxlib:,clean,release,cpu-arch:,debug,extra-targets:,extra-target-dest:,no-clean,clean-only,help,install,no-install,src-path-jax:,src-path-xla:,sm: -- "$@")
81+
args=$(getopt -o h,r --long bazel-cache:,bazel-cache-namespace:,bazel-repo-cache:,build-param:,build-path-jaxlib:,clean,release,cpu-arch:,debug,extra-targets:,extra-target-dest:,no-clean,clean-only,help,install,no-install,src-path-jax:,src-path-xla:,sm: -- "$@")
8082
if [[ $? -ne 0 ]]; then
8183
exit 1
8284
fi
@@ -92,6 +94,10 @@ while [ : ]; do
9294
BAZEL_CACHE_NAMESPACE=$2
9395
shift 2
9496
;;
97+
--bazel-repo-cache)
98+
BAZEL_REPO_CACHE=$2
99+
shift 2
100+
;;
95101
--build-param)
96102
BUILD_PARAM="$BUILD_PARAM $2"
97103
shift 2
@@ -193,6 +199,10 @@ elif [[ ! -z "${BAZEL_CACHE}" ]] ; then
193199
BUILD_PARAM="${BUILD_PARAM} --bazel_options=--disk_cache=${BAZEL_CACHE}"
194200
fi
195201

202+
if [[ -n "${BAZEL_REPO_CACHE}" ]]; then
203+
BUILD_PARAM="${BUILD_PARAM} --bazel_options=--repository_cache=${BAZEL_REPO_CACHE}"
204+
fi
205+
196206
if [[ "$DEBUG" == "1" ]]; then
197207
BUILD_PARAM="${BUILD_PARAM} --bazel_options=-c --bazel_options=dbg --bazel_options=--strip=never --bazel_options=--cxxopt=-g --bazel_options=--cxxopt=-O0"
198208
fi
@@ -203,6 +213,7 @@ echo " Configuration "
203213
echo "--------------------------------------------------"
204214

205215
print_var BAZEL_CACHE
216+
print_var BAZEL_REPO_CACHE
206217
print_var BUILD_PATH_JAXLIB
207218
print_var BUILD_PARAM
208219
print_var CLEAN

.github/workflows/_ci.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,14 @@ jobs:
8282
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
8383
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
8484
github-token: ${{ secrets.GITHUB_TOKEN }}
85-
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
85+
# Bazel remote cache URL intentionally omitted: the Dockerfile now
86+
# uses --disk_cache=/cache/bazel-disk (BuildKit cache mount) and
87+
# --repository_cache=/cache/bazel-repo (seeded via actions/cache).
88+
# To re-enable the legacy remote cache path (e.g. grpc://...), pass the
89+
# BAZEL_REMOTE_CACHE_URL repo variable here again instead of the empty string.
90+
bazel-remote-cache-url: ""
91+
# Enable Bazel repository-cache save/restore for this Bazel-heavy job.
92+
ENABLE_BAZEL_REPO_CACHE: 'true'
8693
EXTRA_BUILD_ARGS: |
8794
URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
8895
URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}

0 commit comments

Comments
 (0)