@@ -54,20 +54,25 @@ RUN dnf config-manager \
5454RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
5555 cuda-command-line-tools-12-8 \
5656 cuda-cudart-devel-12-8 \
57+ cuda-libraries-devel-12-8 \
58+ cuda-compat-12-8 \
59+ cuda-compiler-12-8 \
5760 cuda-nvcc-12-8-12.8.93-1 \
5861 gcc \
5962 gcc-c++ \
6063 make \
6164 python3-devel \
6265 cmake \
6366 git \
67+ && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
68+ && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
6469 && dnf clean all \
6570 && rm -rf /var/cache/dnf/*
6671
6772# Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc
6873ENV CUDA_HOME=/usr/local/cuda \
69-	PATH=/usr/local/cuda/bin:$PATH \
70-	LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
74+	PATH=/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH} \
75+	LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH \
7176 TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
7277
7378# NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
@@ -86,6 +91,10 @@ USER 1001
8691
8792WORKDIR /opt/app-root/src
8893
94+ # Ensure user installs land in app-root and are discoverable by Python
95+ ENV PYTHONUSERBASE=/opt/app-root \
96+ PYTHONNOUSERSITE=0
97+
8998# Add runtime Python dependencies on top of the minimal Jupyter stack.
9099# We intentionally avoid re-installing minimal-provided packages (e.g., jupyterlab) to prevent downgrades.
91100# Torch/cu128 must match CUDA 12.8. FlashAttention is mandatory and currently supported on amd64.
@@ -96,6 +105,9 @@ RUN if [ "$TARGETARCH" != "amd64" ]; then echo "FlashAttention is mandatory and
96105# Install torch from the PyTorch CUDA index separately to avoid affecting other packages' index resolution
97106RUN pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
98107
108+ # Diagnostics: show interpreter, sys.path, and user site locations
109+ RUN python -c "import sys,site,os; print('exe:',sys.executable); print('sys.path:',sys.path); print('userbase:',site.getuserbase()); print('usersite:',site.getusersitepackages()); print('PYTHONNOUSERSITE=',os.environ.get('PYTHONNOUSERSITE'))"
110+
99111# NOTE: Optional build-time check (remove if not needed): verify torch build has CUDA enabled
100112RUN python - <<'PY'
101113import torch, sys
@@ -110,8 +122,8 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3
110122RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
111123RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
112124
113- # Provide ninja via pip (RHEL/UBI repo ninja- build may be unavailable)
114- RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
125+ # Ensure modern build tooling for extensions
126+ RUN pip install --retries 5 --timeout 300 --no-cache-dir -U pip setuptools wheel ninja cmake
115127
116128# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
117129# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
@@ -154,9 +166,37 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
154166 && chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
155167 && fix-permissions /opt/app-root -P
156168
157- # Deterministic 2-step: sub-dep first, then parent without deps (align with runtime)
158- RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation causal-conv1d==1.5.3.post1 && \
159- pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation mamba-ssm==2.2.6.post3 --no-deps && \
169+ # Provide CUDA user-space libraries via pip, aligning with runtime for extension builds
170+ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
171+ nvidia-nccl-cu12==2.27.3 \
172+ nvidia-cublas-cu12==12.8.4.1 \
173+ nvidia-cuda-cupti-cu12==12.8.90 \
174+ nvidia-cuda-nvrtc-cu12==12.8.93 \
175+ nvidia-cuda-runtime-cu12==12.8.90 \
176+ nvidia-cudnn-cu12==9.10.2.21 \
177+ nvidia-cufft-cu12==11.3.3.83 \
178+ nvidia-cufile-cu12==1.13.1.3 \
179+ nvidia-curand-cu12==10.3.9.90 \
180+ nvidia-cusolver-cu12==11.7.3.90 \
181+ nvidia-cusparse-cu12==12.5.8.93 \
182+ nvidia-cusparselt-cu12==0.7.1 \
183+ nvidia-nvjitlink-cu12==12.8.93 \
184+ nvidia-nvtx-cu12==12.8.90 \
185+ && fix-permissions /opt/app-root -P
186+
187+ # Ensure cuDNN from pip is discoverable during source builds
188+ ENV LD_LIBRARY_PATH="/opt/app-root/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}"
189+
190+ # Deterministic 2-step with verbose logging for debugging builds
191+ # 1) Build and log wheel, 2) Install from local wheel (keeps logs on failure)
192+ RUN pip install --no-cache-dir --no-build-isolation -vv --log /tmp/pip-causal.log causal-conv1d==1.5.3.post1 && \
193+ CMAKE_GENERATOR=Ninja CMAKE_ARGS="-DCMAKE_VERBOSE_MAKEFILE=ON" \
194+ pip wheel --no-cache-dir --no-build-isolation -vv \
195+ --log /tmp/pip-mamba-ssm.log \
196+ mamba-ssm==2.2.6.post3 --no-deps -w /tmp/wheels || \
197+ (echo '--- pip mamba-ssm log (tail) ---' ; tail -n 500 /tmp/pip-mamba-ssm.log; exit 1) && \
198+ pip install --no-cache-dir /tmp/wheels/*.whl && \
199+ rm -rf /tmp/wheels && \
160200 fix-permissions /opt/app-root -P
161201
162202# Provide a POSIX entrypoint wrapper to choose behavior based on invocation
0 commit comments