Skip to content

Commit 2f92670

Browse files
add timeouts
Signed-off-by: Brian Gallagher <[email protected]>
1 parent e29c1ca commit 2f92670

File tree

5 files changed

+348
-9
lines changed

5 files changed

+348
-9
lines changed

.tekton/universal-image-py312-cuda128-torch280-pull-request.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ metadata:
1818
name: universal-image-py312-cuda128-torch280-on-pull-request
1919
namespace: open-data-hub-tenant
2020
spec:
21+
timeouts:
22+
pipeline: 9h
2123
params:
2224
- name: git-url
2325
value: '{{source_url}}'

.tekton/universal-image-py312-cuda128-torch280-push.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ metadata:
1717
name: universal-image-py312-cuda128-torch280-on-push
1818
namespace: open-data-hub-tenant
1919
spec:
20+
timeouts:
21+
pipeline: 9h
2022
params:
2123
- name: git-url
2224
value: '{{source_url}}'

images/universal/training/py312-cuda128-torch280/Dockerfile

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ LABEL name="universal:py312-cuda128-torch280" \
1919
io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
2020
io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided. Includes RDMA/IB libs, Torch 2.8.0 cu128, FlashAttention 2.8.3."
2121

22-
COPY ./images/universal/training/py312-cuda128-torch280/LICENSE.md /licenses/cuda-license.md
22+
# TODO: Add the license file and re-enable the COPY below so /licenses/cuda-license.md is shipped in the image
23+
# COPY LICENSE.md /licenses/cuda-license.md
2324

2425
# For OS installs we need elevated privileges; base may default to 1001
2526
USER 0
@@ -108,10 +109,10 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3
108109

109110
# Install build backend for VCS package and the SDK itself (no build isolation so backend is visible)
110111
RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
111-
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
112+
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@add-training-hub"
112113

113114
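# Provide ninja via pip (RHEL/UBI repo ninja-build may be unavailable); standalone install disabled, ninja is installed with the build helpers before the extension builds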
114-
RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
115+
# RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
115116

116117
# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
117118
# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
@@ -125,6 +126,8 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
125126
datasets==4.0.0 \
126127
pydantic>=2.11.7 \
127128
aiofiles==24.1.0 \
129+
deprecated==1.2.18 \
130+
typer==0.19.2 \
128131
"protobuf>=5.28.0,<6.0.0" \
129132
"simpleeval>=0.9.13,<1.0" \
130133
safetensors==0.6.2 \
@@ -140,9 +143,6 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
140143
kernels==0.10.3 \
141144
"sentencepiece>=0.1.99,<0.3" \
142145
tokenizers==0.22.1 \
143-
instructlab-training==0.12.1 \
144-
rhai-innovation-mini-trainer==0.3.0 \
145-
training-hub==0.3.0 \
146146
trl==0.21.0 \
147147
deepspeed>=0.14.3 \
148148
async-timeout==4.0.3 \
@@ -151,12 +151,22 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
151151
huggingface-hub==0.34.4 \
152152
mlflow==3.4.0 \
153153
psutil==7.0.0 \
154+
training-hub==0.3.0 \
155+
instructlab-training==0.12.1 \
156+
rhai-innovation-mini-trainer==0.3.0 \
154157
&& chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
155158
&& fix-permissions /opt/app-root -P
156159

157-
# Deterministic 2-step: sub-dep first, then parent without deps (align with runtime)
158-
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation causal-conv1d==1.5.3.post1 && \
159-
pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation mamba-ssm==2.2.6.post3 --no-deps && \
160+
161+
# Build helpers to compile PyTorch extensions
162+
RUN pip install -U pip setuptools wheel ninja cmake
163+
164+
# Optional: set GPU archs if you hit arch issues
165+
# ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
166+
167+
# Deterministic 2-step: install the sub-dependency (causal-conv1d) first, then the parent (mamba-ssm) with --no-deps
168+
RUN pip install --no-build-isolation --no-cache-dir causal-conv1d==1.5.3.post1 && \
169+
pip install --no-build-isolation --no-cache-dir mamba-ssm==2.2.6.post3 --no-deps && \
160170
fix-permissions /opt/app-root -P
161171

162172
# Provide a POSIX entrypoint wrapper to choose behavior based on invocation

0 commit comments

Comments
 (0)