Skip to content

Commit ff5f03b

Browse files
add timeouts
Signed-off-by: Brian Gallagher <[email protected]>
1 parent e24db03 commit ff5f03b

File tree

4 files changed

+59
-13
lines changed

4 files changed

+59
-13
lines changed

.tekton/universal-image-py312-cuda128-torch280-pull-request.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ metadata:
1818
name: universal-image-py312-cuda128-torch280-on-pull-request
1919
namespace: open-data-hub-tenant
2020
spec:
21+
timeouts:
22+
pipeline: 9h
2123
params:
2224
- name: git-url
2325
value: '{{source_url}}'

.tekton/universal-image-py312-cuda128-torch280-push.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ metadata:
1717
name: universal-image-py312-cuda128-torch280-on-push
1818
namespace: open-data-hub-tenant
1919
spec:
20+
timeouts:
21+
pipeline: 9h
2022
params:
2123
- name: git-url
2224
value: '{{source_url}}'

images/universal/training/py312-cuda128-torch280/Dockerfile

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,25 @@ RUN dnf config-manager \
5454
RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
5555
cuda-command-line-tools-12-8 \
5656
cuda-cudart-devel-12-8 \
57+
cuda-libraries-devel-12-8 \
58+
cuda-compat-12-8 \
59+
cuda-compiler-12-8 \
5760
cuda-nvcc-12-8-12.8.93-1 \
5861
gcc \
5962
gcc-c++ \
6063
make \
6164
python3-devel \
6265
cmake \
6366
git \
67+
&& echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
68+
&& echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
6469
&& dnf clean all \
6570
&& rm -rf /var/cache/dnf/*
6671

6772
# Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc
6873
ENV CUDA_HOME=/usr/local/cuda \
69-
PATH=/usr/local/cuda/bin:$PATH \
70-
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
74+
PATH=/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH} \
75+
LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH \
7176
TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
7277

7378
# NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
@@ -86,6 +91,10 @@ USER 1001
8691

8792
WORKDIR /opt/app-root/src
8893

94+
# Ensure user installs land in app-root and are discoverable by Python
95+
ENV PYTHONUSERBASE=/opt/app-root \
96+
PYTHONNOUSERSITE=0
97+
8998
# Add runtime Python dependencies on top of the minimal Jupyter stack.
9099
# We intentionally avoid re-installing minimal-provided packages (e.g., jupyterlab) to prevent downgrades.
91100
# Torch/cu128 must match CUDA 12.8. FlashAttention is mandatory and is currently supported only on amd64 (prebuilt wheels).
@@ -94,7 +103,11 @@ ARG TARGETARCH
94103
RUN if [ "$TARGETARCH" != "amd64" ]; then echo "FlashAttention is mandatory and requires amd64 prebuilt wheels. Build with --platform linux/amd64." >&2; exit 1; fi
95104

96105
# Install torch from the PyTorch CUDA index separately to avoid affecting other packages' index resolution
97-
RUN pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
106+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
107+
108+
# Diagnostics: show interpreter, sys.path, and user site locations
109+
RUN python -c "import sys,site,os; print('exe:',sys.executable); print('sys.path:',sys.path); print('userbase:',site.getuserbase()); print('usersite:',site.getusersitepackages()); print('PYTHONNOUSERSITE=',os.environ.get('PYTHONNOUSERSITE'))"
110+
RUN python -m pip show torch || true && python -c "import importlib.util; print('torch_spec:', importlib.util.find_spec('torch'))"
98111

99112
# NOTE: Optional build-time check (remove if not needed): verify torch build has CUDA enabled
100113
RUN python - <<'PY'
@@ -104,19 +117,19 @@ sys.exit(0 if torch.backends.cuda.is_built() else 1)
104117
PY
105118

106119
# Install numpy ahead of building extensions that expect it
107-
RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3
120+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.4
108121

109122
# Install the build backend (hatchling, hatch-vcs) first, then the VCS-hosted SDK with --no-build-isolation so pip uses that preinstalled backend
110-
RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
111-
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
123+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
124+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
112125

113-
# Provide ninja via pip (RHEL/UBI repo ninja-build may be unavailable)
114-
RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
126+
# Ensure modern build tooling for extensions
127+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir -U pip setuptools wheel ninja cmake
115128

116129
# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
117130
# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
118131
# to control CUDA/FA install order and indexes, and to reduce lock churn across arches/ABI-specific wheels.
119-
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
132+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir \
120133
flash-attn==2.8.3 --no-build-isolation \
121134
accelerate==1.10.0 \
122135
transformers==4.57.1 \
@@ -154,10 +167,38 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
154167
&& chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
155168
&& fix-permissions /opt/app-root -P
156169

157-
# Deterministic 2-step: sub-dep first, then parent without deps (align with runtime)
158-
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation causal-conv1d==1.5.3.post1 && \
159-
pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation mamba-ssm==2.2.6.post3 --no-deps && \
160-
fix-permissions /opt/app-root -P
170+
# Provide CUDA user-space libraries via pip, aligning with runtime for extension builds
171+
RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir \
172+
nvidia-nccl-cu12==2.27.3 \
173+
nvidia-cublas-cu12==12.8.4.1 \
174+
nvidia-cuda-cupti-cu12==12.8.90 \
175+
nvidia-cuda-nvrtc-cu12==12.8.93 \
176+
nvidia-cuda-runtime-cu12==12.8.90 \
177+
nvidia-cudnn-cu12==9.10.2.21 \
178+
nvidia-cufft-cu12==11.3.3.83 \
179+
nvidia-cufile-cu12==1.13.1.3 \
180+
nvidia-curand-cu12==10.3.9.90 \
181+
nvidia-cusolver-cu12==11.7.3.90 \
182+
nvidia-cusparse-cu12==12.5.8.93 \
183+
nvidia-cusparselt-cu12==0.7.1 \
184+
nvidia-nvjitlink-cu12==12.8.93 \
185+
nvidia-nvtx-cu12==12.8.90 \
186+
&& fix-permissions /opt/app-root -P
187+
188+
# Ensure cuDNN from pip is discoverable during source builds
189+
ENV LD_LIBRARY_PATH="/opt/app-root/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}"
190+
191+
# # Deterministic 2-step with verbose logging for debugging builds
192+
# # 1) Build and log wheel, 2) Install from local wheel (keeps logs on failure)
193+
# RUN python -m pip install --no-cache-dir --no-build-isolation -vv --log /tmp/pip-causal.log causal-conv1d==1.5.3.post1 && \
194+
# CMAKE_GENERATOR=Ninja CMAKE_ARGS="-DCMAKE_VERBOSE_MAKEFILE=ON" \
195+
# python -m pip wheel --no-cache-dir --no-build-isolation -vv \
196+
# --log /tmp/pip-mamba-ssm.log \
197+
# mamba-ssm==2.2.6.post3 --no-deps -w /tmp/wheels || \
198+
# (echo '--- pip mamba-ssm log (tail) ---'; tail -n 500 /tmp/pip-mamba-ssm.log; exit 1) && \
199+
# python -m pip install --no-cache-dir /tmp/wheels/*.whl && \
200+
# rm -rf /tmp/wheels && \
201+
# fix-permissions /opt/app-root -P
161202

162203
# Provide a POSIX entrypoint wrapper to choose behavior based on invocation
163204
COPY --chmod=0755 ./images/universal/training/py312-cuda128-torch280/entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

images/universal/training/py312-cuda128-torch280/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ CUDA enabled container image for Training Workbench and Training Runtime in Open
44

55
It includes the following layers:
66
* UBI 9
7+
* Minimal Workbench
78
* Python 3.12
89
* CUDA 12.8
910
* PyTorch 2.8.0

0 commit comments

Comments
 (0)