Skip to content

Commit d62c537

Browse files
add timeouts
Signed-off-by: Brian Gallagher <[email protected]>
1 parent e24db03 commit d62c537

File tree

6 files changed

+178
-17
lines changed

6 files changed

+178
-17
lines changed

.tekton/universal-image-py312-cuda128-torch280-pull-request.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ metadata:
1818
name: universal-image-py312-cuda128-torch280-on-pull-request
1919
namespace: open-data-hub-tenant
2020
spec:
21+
timeouts:
22+
pipeline: 9h
2123
params:
2224
- name: git-url
2325
value: '{{source_url}}'

.tekton/universal-image-py312-cuda128-torch280-push.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ metadata:
1717
name: universal-image-py312-cuda128-torch280-on-push
1818
namespace: open-data-hub-tenant
1919
spec:
20+
timeouts:
21+
pipeline: 9h
2022
params:
2123
- name: git-url
2224
value: '{{source_url}}'

images/runtime/.DS_Store

8 KB
Binary file not shown.

images/runtime/training/.DS_Store

8 KB
Binary file not shown.

images/universal/training/py312-cuda128-torch280/Dockerfile

Lines changed: 173 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ ENV NVIDIA_VISIBLE_DEVICES=all \
3030
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
3131
CUDA_VERSION=12.8 \
3232
PIP_DEFAULT_TIMEOUT=600 \
33-
PIP_DISABLE_PIP_VERSION_CHECK=1
33+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
34+
PIP_UPGRADE_STRATEGY=only-if-needed
3435

3536
# Follow runtime: enable CUDA and Mellanox OFED repositories for RDMA/IB packages.
3637
# Note: The base image already includes CUDA 12.8 runtime; we only add missing components (e.g., RDMA libs).
@@ -54,38 +55,64 @@ RUN dnf config-manager \
5455
RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
5556
cuda-command-line-tools-12-8 \
5657
cuda-cudart-devel-12-8 \
58+
cuda-libraries-devel-12-8 \
59+
cuda-compat-12-8 \
60+
cuda-compiler-12-8 \
5761
cuda-nvcc-12-8-12.8.93-1 \
5862
gcc \
5963
gcc-c++ \
6064
make \
6165
python3-devel \
6266
cmake \
6367
git \
68+
&& echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
69+
&& echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
6470
&& dnf clean all \
6571
&& rm -rf /var/cache/dnf/*
6672

6773
# Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc
74+
ARG CUDA_ARCH_LIST=9.0
6875
ENV CUDA_HOME=/usr/local/cuda \
69-
PATH=/usr/local/cuda/bin:$PATH \
70-
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
71-
TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
76+
# Literal toolkit paths on purpose: a variable assigned in this ENV instruction (CUDA_HOME) is not yet visible to the other keys of the same instruction.
PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} \
77+
LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH \
78+
TORCH_CUDA_ARCH_LIST="${CUDA_ARCH_LIST}"
7279

80+
# Extra verbosity and progress for CUDA extension builds (Ninja progress, Torch verbose)
81+
ENV USE_NINJA=1 \
82+
TORCH_CUDA_VERBOSE_BUILD=1 \
83+
NINJA_STATUS="[%f/%t %es] "
7384
# NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
7485
# Verify CUDA toolkit present and nvcc available
7586
RUN /usr/local/cuda/bin/nvcc -V
7687
# Verify key CUDA libs are discoverable
7788
RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || (echo "[fail-fast] CUDA libs not found in ldconfig" >&2; exit 1)
89+
RUN mkdir -p /opt/app-root/.ccache && chown -R 1001:0 /opt/app-root/.ccache
7890

7991
# Quick preflight: verify torch wheel and flash-attn index are reachable to fail fast before large downloads
8092
ARG TORCH_WHEEL_FILE=https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
8193
RUN curl -IfsS --connect-timeout 10 --max-time 20 "$TORCH_WHEEL_FILE" > /dev/null || (echo "[fail-fast] Torch cu128 wheel not reachable: $TORCH_WHEEL_FILE" >&2; exit 1)
8294
RUN curl -IfsS --connect-timeout 10 --max-time 20 https://pypi.org/simple/flash-attn/ > /dev/null || (echo "[fail-fast] PyPI flash-attn index not reachable" >&2; exit 1)
95+
#
96+
# Additional diagnostics to help debug build env before Python installs
97+
RUN python -m pip debug --verbose || true
98+
RUN gcc --version | head -n1 || true
99+
RUN g++ --version | head -n1 || true
100+
RUN /usr/local/cuda/bin/nvcc -V | head -n1 || true
83101

84102
# Switch back to the non-root user for Python environment changes
85103
USER 1001
86104

87105
WORKDIR /opt/app-root/src
88106

107+
# Ensure user installs land in app-root and are discoverable by Python
108+
ENV PYTHONUSERBASE=/opt/app-root \
109+
PYTHONNOUSERSITE=0
110+
111+
# Speed up repeated native builds using ccache (if present)
112+
ENV CCACHE_DIR=/opt/app-root/.ccache \
113+
CCACHE_MAXSIZE=5G \
114+
CCACHE_COMPRESS=1
115+
89116
# Add runtime Python dependencies on top of the minimal Jupyter stack.
90117
# We intentionally avoid re-installing minimal-provided packages (e.g., jupyterlab) to prevent downgrades.
91118
# Torch/cu128 must match CUDA 12.8. FlashAttention is mandatory and currently supported on amd64.
@@ -94,7 +121,11 @@ ARG TARGETARCH
94121
RUN if [ "$TARGETARCH" != "amd64" ]; then echo "FlashAttention is mandatory and requires amd64 prebuilt wheels. Build with --platform linux/amd64." >&2; exit 1; fi
95122

96123
# Install torch from the PyTorch CUDA index separately to avoid affecting other packages' index resolution
97-
RUN pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
124+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
125+
126+
# Diagnostics: show interpreter, sys.path, and user site locations
127+
RUN python -c "import sys,site,os; print('exe:',sys.executable); print('sys.path:',sys.path); print('userbase:',site.getuserbase()); print('usersite:',site.getusersitepackages()); print('PYTHONNOUSERSITE=',os.environ.get('PYTHONNOUSERSITE'))"
128+
RUN python -m pip show torch || true && python -c "import importlib.util; print('torch_spec:', importlib.util.find_spec('torch'))"
98129

99130
# NOTE: Optional build-time check (remove if not needed): verify torch build has CUDA enabled
100131
RUN python - <<'PY'
@@ -104,19 +135,32 @@ sys.exit(0 if torch.backends.cuda.is_built() else 1)
104135
PY
105136

106137
# Install numpy ahead of building extensions that expect it
107-
RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3
138+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.4
108139

109140
# Install build backend for VCS package and the SDK itself (no build isolation so backend is visible)
110-
RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
111-
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
141+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
142+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
143+
#
144+
# NOTE: kubeflow pulls kubernetes==34.x which requires urllib3<2.4, but
145+
# training-hub requires urllib3>=2.4. There is no kubernetes>=35 on PyPI yet.
146+
# We intentionally keep urllib3>=2.4 for training-hub and accept the mismatch.
147+
# To avoid resolution failure, we do NOT try to force-upgrade kubernetes here.
148+
# RUN python -m pip install --retries 5 --timeout 180 --no-cache-dir \
149+
# "kubernetes>=35.0.0" "urllib3>=2.4,<3"
112150

113-
# Provide ninja via pip (RHEL/UBI repo ninja-build may be unavailable)
114-
RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
151+
# Ensure modern build tooling for extensions
152+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir -U pip setuptools wheel ninja cmake
153+
#
154+
# Fail-fast: ensure binary wheels exist for packages that are expensive to build
155+
RUN mkdir -p /tmp/wheels && \
156+
python -m pip download --retries 5 --timeout 120 --only-binary=:all: --no-deps -d /tmp/wheels \
157+
numba==0.62.1 bitsandbytes==0.48.1 || \
158+
(echo "[fail-fast] Missing binary wheel for numba or bitsandbytes on this platform (refusing to build from source)." >&2; exit 1)
115159

116160
# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
117161
# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
118162
# to control CUDA/FA install order and indexes, and to reduce lock churn across arches/ABI-specific wheels.
119-
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
163+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --prefer-binary --only-binary=numba,bitsandbytes --upgrade-strategy only-if-needed \
120164
flash-attn==2.8.3 --no-build-isolation \
121165
accelerate==1.10.0 \
122166
transformers==4.57.1 \
@@ -125,6 +169,8 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
125169
datasets==4.0.0 \
126170
"pydantic>=2.11.7" \
127171
aiofiles==24.1.0 \
172+
deprecated==1.2.18 \
173+
typer==0.19.2 \
128174
"protobuf>=5.28.0,<6.0.0" \
129175
"simpleeval>=0.9.13,<1.0" \
130176
safetensors==0.6.2 \
@@ -140,9 +186,6 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
140186
kernels==0.10.3 \
141187
"sentencepiece>=0.1.99,<0.3" \
142188
tokenizers==0.22.1 \
143-
instructlab-training==0.12.1 \
144-
rhai-innovation-mini-trainer==0.3.0 \
145-
training-hub==0.3.0 \
146189
trl==0.21.0 \
147190
"deepspeed>=0.14.3" \
148191
async-timeout==4.0.3 \
@@ -151,12 +194,125 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
151194
huggingface-hub==0.34.4 \
152195
mlflow==3.4.0 \
153196
psutil==7.0.0 \
197+
training-hub==0.3.0 \
198+
instructlab-training==0.12.1 \
199+
rhai-innovation-mini-trainer==0.3.0 \
154200
&& chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
155201
&& fix-permissions /opt/app-root -P
202+
#
203+
# WARNING: Skipping `pip check` due to known kubernetes(urllib3<2.4) vs
204+
# training-hub(urllib3>=2.4) requirement mismatch. Re-enable once upstream
205+
# loosens kubernetes urllib3 bounds or kubeflow no longer pins 34.x.
206+
# RUN python -m pip check || (python -m pip freeze; exit 1)
207+
#
208+
# Numba diagnostics (helps catch llvmlite/LLVM/NumPy mismatches quickly)
209+
RUN python -m numba -s || (echo "[diagnostics] numba sysinfo failed" >&2; exit 1)
210+
211+
# Provide CUDA user-space libraries via pip, aligning with runtime for extension builds
212+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir \
213+
nvidia-nccl-cu12==2.27.3 \
214+
nvidia-cublas-cu12==12.8.4.1 \
215+
nvidia-cuda-cupti-cu12==12.8.90 \
216+
nvidia-cuda-nvrtc-cu12==12.8.93 \
217+
nvidia-cuda-runtime-cu12==12.8.90 \
218+
nvidia-cudnn-cu12==9.10.2.21 \
219+
nvidia-cufft-cu12==11.3.3.83 \
220+
nvidia-cufile-cu12==1.13.1.3 \
221+
nvidia-curand-cu12==10.3.9.90 \
222+
nvidia-cusolver-cu12==11.7.3.90 \
223+
nvidia-cusparse-cu12==12.5.8.93 \
224+
nvidia-cusparselt-cu12==0.7.1 \
225+
nvidia-nvjitlink-cu12==12.8.93 \
226+
nvidia-nvtx-cu12==12.8.90 \
227+
&& fix-permissions /opt/app-root -P
228+
229+
# Ensure cuDNN from pip is discoverable during source builds
230+
ENV LD_LIBRARY_PATH="/opt/app-root/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}"
156231

157-
# Deterministic 2-step: sub-dep first, then parent without deps (align with runtime)
158-
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation causal-conv1d==1.5.3.post1 && \
159-
pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation mamba-ssm==2.2.6.post3 --no-deps && \
232+
# Deterministic 2-step with logging; optional ccache + parallelism for speed
233+
# 1) Build and log wheel; 2) Install from local wheel
234+
RUN set -e; \
235+
if command -v ccache >/dev/null 2>&1; then \
236+
export CC="ccache gcc" CXX="ccache g++" CUDAHOSTCXX="ccache g++"; \
237+
CC_LAUNCH='-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache'; \
238+
else \
239+
export CC="gcc" CXX="g++" CUDAHOSTCXX="g++"; \
240+
CC_LAUNCH=''; \
241+
fi; \
242+
ARCH_NO_DOT="$(echo "${CUDA_ARCH_LIST}" | tr -d '.')" ; \
243+
export TORCH_CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" ; \
244+
export PYTORCH_NVCC_FLAGS="${PYTORCH_NVCC_FLAGS:-} -gencode arch=compute_${ARCH_NO_DOT},code=sm_${ARCH_NO_DOT}" ; \
245+
MAX_JOBS="$(nproc)" TORCH_CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" PYTORCH_NVCC_FLAGS="${PYTORCH_NVCC_FLAGS}" python -m pip install --no-cache-dir --no-build-isolation -vv --log /tmp/pip-causal.log causal-conv1d==1.5.3.post1 && \
246+
CMAKE_GENERATOR=Ninja CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" MAX_JOBS="$(nproc)" CMAKE_ARGS="-DCMAKE_VERBOSE_MAKEFILE=OFF -DCMAKE_BUILD_TYPE=Release ${CC_LAUNCH} -DCMAKE_CUDA_ARCHITECTURES=${ARCH_NO_DOT}" \
247+
echo "[diag] nproc=$(nproc)"; df -h /tmp || true; cmake --version || true; ninja --version || true; which nvcc && nvcc -V || true; which g++ && g++ --version | head -n1 || true; \
248+
export FORCE_CUDA=1 TORCH_EXTENSIONS_VERBOSE=0; \
249+
printf '%s\n' \
250+
'#!/usr/bin/env bash' \
251+
'set -euo pipefail' \
252+
'build_log=/tmp/pip-mamba-ssm.log' \
253+
'mkdir -p /tmp/wheels' \
254+
': > "$build_log"' \
255+
'echo "[diag] starting wheel build at: $(date)"' \
256+
'TIME_LIMIT_SECS=1200' \
257+
'ARCH_NO_DOT="$(echo "${CUDA_ARCH_LIST:-9.0}" | tr -d ".")"' \
258+
'export TORCH_CUDA_ARCH_LIST="${CUDA_ARCH_LIST:-9.0}"' \
259+
'export PYTORCH_NVCC_FLAGS="${PYTORCH_NVCC_FLAGS:-} -gencode arch=compute_${ARCH_NO_DOT},code=sm_${ARCH_NO_DOT}"' \
260+
'if command -v ccache >/dev/null 2>&1; then' \
261+
' export CC="ccache gcc" CXX="ccache g++" CUDAHOSTCXX="ccache g++"' \
262+
' CC_LAUNCH="-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache"' \
263+
'else' \
264+
' export CC="gcc" CXX="g++" CUDAHOSTCXX="g++"' \
265+
' CC_LAUNCH=""' \
266+
'fi' \
267+
'export CMAKE_GENERATOR=Ninja CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" MAX_JOBS="$(nproc)"' \
268+
'export CMAKE_ARGS="-DCMAKE_VERBOSE_MAKEFILE=OFF -DCMAKE_BUILD_TYPE=Release ${CC_LAUNCH} -DCMAKE_CUDA_ARCHITECTURES=${ARCH_NO_DOT}"' \
269+
'export FORCE_CUDA=1 TORCH_EXTENSIONS_VERBOSE=0' \
270+
'rm -rf "$HOME/.cache/torch_extensions" /tmp/wheels/* || true' \
271+
'PYTHON_WHEEL_CMD='\''python -m pip wheel --no-cache-dir --no-build-isolation -vv'\''' \
272+
'$PYTHON_WHEEL_CMD \' \
273+
' --log "$build_log" \' \
274+
' mamba-ssm==2.2.6.post3 --no-deps -w /tmp/wheels &' \
275+
'pid=$!' \
276+
'echo "[diag] following pip log at $build_log" >&2' \
277+
'stdbuf -oL -eL tail -F "$build_log" >&2 & tail_pid=$!' \
278+
'start=$(date +%s)' \
279+
'while kill -0 "$pid" 2>/dev/null; do' \
280+
' now=$(date +%s)' \
281+
' elapsed=$(( now - start ))' \
282+
' minute=$(( elapsed / 60 ))' \
283+
' echo "[heartbeat] $(date): elapsed=${elapsed}s (minute=${minute})"' \
284+
' echo "[heartbeat] $(date): elapsed=${elapsed}s (minute=${minute})" >&2' \
285+
' echo "[heartbeat] ps snapshot:" >&2' \
286+
' ps -o pid,pgid,stat,etime,pcpu,pmem,comm,args -p "$pid" || true' \
287+
' echo "[heartbeat] compiled objects so far: $(find /tmp -type f -name "*.o" | wc -l || true)"' \
288+
' echo "[heartbeat] compiled objects so far: $(find /tmp -type f -name "*.o" | wc -l || true)" >&2' \
289+
' df -h /tmp || true' \
290+
' df -h /tmp >&2 || true' \
291+
' if [ "$elapsed" -ge "$TIME_LIMIT_SECS" ]; then' \
292+
' echo "[timeout] wheel build exceeded $(( TIME_LIMIT_SECS / 60 )) minutes; killing...";' \
293+
' pgid=$(ps -o pgid= "$pid" 2>/dev/null | tr -d " " || true);' \
294+
' if [ -n "$pgid" ]; then' \
295+
' echo "[timeout] killing process group ${pgid}";' \
296+
' kill -TERM -"${pgid}" || true; sleep 10; kill -KILL -"${pgid}" || true;' \
297+
' fi' \
298+
' kill "$pid" || true; sleep 10; kill -9 "$pid" || true;' \
299+
' echo "--- pip mamba-ssm log (tail) ---"; tail -n 1000 "$build_log" || true;' \
300+
' exit 1;' \
301+
' fi' \
302+
' sleep 15' \
303+
'done' \
304+
'status=0; wait "$pid" || status=$?' \
305+
'kill "$tail_pid" 2>/dev/null || true' \
306+
'if [ "$status" -ne 0 ]; then' \
307+
' echo "[fail] wheel build exited with status $status";' \
308+
' echo "--- pip mamba-ssm log (tail) ---"; tail -n 1000 "$build_log" || true;' \
309+
' exit "$status";' \
310+
'fi' \
311+
> /tmp/build_mamba_wheel.sh && \
312+
chmod +x /tmp/build_mamba_wheel.sh && \
313+
timeout -k 120s 22m bash /tmp/build_mamba_wheel.sh || (echo '--- pip mamba-ssm log (tail) ---'; tail -n 1000 /tmp/pip-mamba-ssm.log; exit 1) && \
314+
python -m pip install --no-cache-dir /tmp/wheels/*.whl && \
315+
rm -rf /tmp/wheels && \
160316
fix-permissions /opt/app-root -P
161317

162318
# Provide a POSIX entrypoint wrapper to choose behavior based on invocation

images/universal/training/py312-cuda128-torch280/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ CUDA enabled container image for Training Workbench and Training Runtime in Open
44

55
It includes the following layers:
66
* UBI 9
7+
* Minimal Workbench
78
* Python 3.12
89
* CUDA 12.8
910
* PyTorch 2.8.0

0 commit comments

Comments
 (0)