Skip to content

Commit 53b10a0

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 450820a + 2595866 commit 53b10a0

File tree

3 files changed

+187
-0
lines changed

3 files changed

+187
-0
lines changed

OWNERS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
approvers:
22
- astefanutti
3+
- briangallagher
34
- chipspeak
45
- efazal
56
- Fiona-Waters
7+
- kramaranya
68
- kryanbeane
79
- laurafitzgerald
10+
- MStokluska
811
- pawelpaszki
912
- sutaakar
1013
- szaher
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
# Universal image Dockerfile
#
# Base image:
# - Minimal Jupyter CUDA workbench with CUDA 12.8 and Python 3.12
# - Provides JupyterLab, Elyra integration, addons, and default ENTRYPOINT start-notebook.sh
# - Source: quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903
#
# Design intent:
# - Preserve workbench behavior by default (no args → start-notebook.sh)
# - Add runtime capabilities on top (Python ML/training stack, RDMA/IB packages)
# - Avoid duplicating dependencies provided by the base image
# - Allow headless runtime mode when a command is provided (args → exec that command)

FROM quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903

LABEL name="universal:py312-cuda128-torch280" \
      summary="Universal CUDA 12.8 Python 3.12 image with PyTorch 2.8.0" \
      description="Universal image combining minimal Jupyter workbench and runtime ML stack (CUDA 12.8, PyTorch 2.8.0, FlashAttention 2.8.3) on UBI9" \
      io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
      io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided. Includes RDMA/IB libs, Torch 2.8.0 cu128, FlashAttention 2.8.3."

## TODO: Add license file
# COPY LICENSE.md /licenses/cuda-license.md

# For OS installs we need elevated privileges; base may default to 1001
USER 0
WORKDIR /opt/app-root/bin

# Keep NVIDIA driver capability constraints consistent with runtime image behavior
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=12.8 \
    PIP_DEFAULT_TIMEOUT=600 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Follow runtime: enable CUDA and Mellanox OFED repositories for RDMA/IB packages.
# Note: The base image already includes CUDA 12.8 runtime; we only add missing components (e.g., RDMA libs).
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf config-manager \
        --add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/latest/rhel9.5/mellanox_mlnx_ofed.repo \
    && dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,mlnx_ofed_24.10-1.1.4.0_base,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
        libibverbs-utils \
        infiniband-diags \
        libibumad3 \
        librdmacm \
        librdmacm-utils \
        rdma-core \
        mlnx-tools \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Install CUDA NVCC and build toolchain required to build FlashAttention from source
# NOTE: Use command-line CUDA packages to avoid Nsight GUI deps (X11 libs) not available in UBI
RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
        cuda-command-line-tools-12-8 \
        cuda-cudart-devel-12-8 \
        cuda-nvcc-12-8-12.8.93-1 \
        gcc \
        gcc-c++ \
        make \
        python3-devel \
        cmake \
        git \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc
ENV CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"

# NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
# Verify CUDA toolkit present and nvcc available
RUN /usr/local/cuda/bin/nvcc -V
# Verify key CUDA libs are discoverable
RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || (echo "[fail-fast] CUDA libs not found in ldconfig" >&2; exit 1)

# Quick preflight: verify torch wheel and flash-attn index are reachable to fail fast before large downloads
ARG TORCH_WHEEL_FILE=https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
RUN curl -IfsS --connect-timeout 10 --max-time 20 "$TORCH_WHEEL_FILE" > /dev/null || (echo "[fail-fast] Torch cu128 wheel not reachable: $TORCH_WHEEL_FILE" >&2; exit 1)
RUN curl -IfsS --connect-timeout 10 --max-time 20 https://pypi.org/simple/flash-attn/ > /dev/null || (echo "[fail-fast] PyPI flash-attn index not reachable" >&2; exit 1)

# Switch back to the non-root user for Python environment changes
USER 1001

WORKDIR /opt/app-root/src

# Add runtime Python dependencies on top of the minimal Jupyter stack.
# We intentionally avoid re-installing minimal-provided packages (e.g., jupyterlab) to prevent downgrades.
# Torch/cu128 must match CUDA 12.8. FlashAttention is mandatory and currently supported on amd64.
ARG TARGETARCH
# Enforce amd64 for FlashAttention wheel availability
RUN if [ "$TARGETARCH" != "amd64" ]; then echo "FlashAttention is mandatory and requires amd64 prebuilt wheels. Build with --platform linux/amd64." >&2; exit 1; fi

# Install torch from the PyTorch CUDA index separately to avoid affecting other packages' index resolution
RUN pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128

# NOTE: Optional build-time check (remove if not needed): verify torch build has CUDA enabled
RUN python - <<'PY'
import torch, sys
print("[check] torch", torch.__version__, "cuda build:", torch.version.cuda)
sys.exit(0 if torch.backends.cuda.is_built() else 1)
PY

# Install numpy ahead of building extensions that expect it
RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3

# Install build backend for VCS package and the SDK itself (no build isolation so backend is visible)
RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"

# Provide ninja via pip (RHEL/UBI repo ninja-build may be unavailable)
RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja

# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
# to control CUDA/FA install order and indexes, and to reduce lock churn across arches/ABI-specific wheels.
# IMPORTANT: every specifier containing '>' or '<' MUST be quoted; unquoted, the shell
# treats '>' as output redirection and pip never sees the requirement (the package is
# silently skipped and a stray file like '=2.11.7' is created in the build layer).
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
    flash-attn==2.8.3 --no-build-isolation \
    accelerate==1.10.0 \
    transformers==4.55.2 \
    peft==0.17.0 \
    tqdm==4.67.1 \
    datasets==4.0.0 \
    "pydantic>=2.11.7" \
    aiofiles==24.1.0 \
    "protobuf>=5.28.0,<6.0.0" \
    "simpleeval>=0.9.13,<1.0" \
    safetensors==0.6.2 \
    packaging==25.0 \
    pyyaml==6.0.2 \
    py-cpuinfo==9.0.0 \
    numba==0.61.2 \
    rich==14.1.0 \
    tensorboard==2.19.0 \
    "bitsandbytes>=0.45.3" \
    liger-kernel==0.5.10 \
    "sentencepiece>=0.1.99,<0.3" \
    tokenizers==0.21.4 \
    training-hub==0.2.0 \
    trl==0.21.0 \
    "deepspeed>=0.14.3" \
    async-timeout==4.0.3 \
    aiohttp==3.12.15 \
    hf-xet==1.1.8 \
    huggingface-hub==0.34.4 \
    mlflow==3.4.0 \
    psutil==7.0.0 \
    && chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
    && fix-permissions /opt/app-root -P

# Provide a POSIX entrypoint wrapper to choose behavior based on invocation
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Set ENTRYPOINT to the wrapper so that providing a command runs headless.
# Default CMD maintains workbench behavior (no args → start-notebook.sh)
ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env sh
# Strict mode: -e exits on unhandled errors; -u (added) errors on unset
# variables. Safe here because the only optional variable this script reads
# is already guarded with a default expansion: ${RUNTIME_MODE:-}.
set -eu
# Universal entrypoint
# Behavior:
# - If RUNTIME_MODE is set to a truthy value, run provided command (headless). If no command provided, exit with help.
# - Otherwise, start the workbench notebook exactly like the base image.
# is_truthy VALUE
# Succeeds (exit 0) when VALUE is a truthy string — y, yes, true, or 1,
# matched case-insensitively — and fails (exit 1) for anything else,
# including the empty string.
is_truthy() {
  case "$(printf %s "$1" | tr '[:upper:]' '[:lower:]')" in
    y) return 0 ;;
    yes) return 0 ;;
    true) return 0 ;;
    1) return 0 ;;
  esac
  return 1
}
# Dispatch on RUNTIME_MODE: headless runtime when truthy, workbench otherwise.
if is_truthy "${RUNTIME_MODE:-}"; then
  # Headless mode requires an explicit command; refuse to guess one.
  if [ "$#" -eq 0 ]; then
    echo "RUNTIME_MODE=true but no command provided. Provide a command, e.g.: python -m your.module" >&2
    exit 2
  fi
  exec "$@"
fi

# Default path: behave exactly like the base workbench image.
exec start-notebook.sh

0 commit comments

Comments
 (0)