Commit 4102ee1

universal-image-th04
1 parent 8c04a90 commit 4102ee1

File tree

8 files changed, +5197 -0 lines changed

Lines changed: 276 additions & 0 deletions
@@ -0,0 +1,276 @@
# Universal Image Dockerfile
#
# FIPS-friendly Features:
# - uv is used only during the build (not shipped in the runtime image)
# - Build tools are isolated in intermediate stages
# - Final image contains only runtime dependencies
# - OpenSSL FIPS mode supported via base image

################################################################################
# Build Arguments
################################################################################
ARG BASE_IMAGE=quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903
# CUDA_VERSION: Used for environment variables and documentation purposes
# - Sets CUDA_VERSION env var (helps with debugging and tooling)
# - Not used for package installation (specific versions hardcoded below)
ARG CUDA_VERSION=12.8
# PYTHON_VERSION: Critical for path resolution in multi-stage build
# - Used to locate site-packages directory (e.g., /opt/app-root/lib/python3.12/site-packages)
# - Must match base image Python version
ARG PYTHON_VERSION=3.12

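# Illustrative only (not executed by this file): the arguments above can be
# overridden at build time; the tag below is an assumed example, not mandated
# by this Dockerfile:
#   podman build -t universal:py312-cuda128-torch290 \
#     --build-arg BASE_IMAGE=quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903 \
#     --build-arg PYTHON_VERSION=3.12 .
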
################################################################################
# Builder Stage - Install uv for dependency resolution
################################################################################
FROM ${BASE_IMAGE} AS builder

USER 0
WORKDIR /tmp/builder

# Install the latest version of uv in the builder stage
# Why: Even if the base image ships uv, we install it here for:
# - Latest bug fixes and performance improvements
# - Consistent behavior across builds
# - A uv version that does not depend on whatever the base image provides
# Note: This uv is isolated in the builder stage and copied selectively to other stages
RUN pip install --no-cache-dir uv
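
# Illustrative variant (assumption, not what this build does): for strictly
# reproducible builds the uv version could be pinned instead of taking the
# latest release, e.g.:
#   pip install --no-cache-dir "uv==<pinned-version>"
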
################################################################################
# Base Stage
################################################################################
FROM ${BASE_IMAGE} AS base

LABEL name="universal:py312-cuda128-torch290" \
      summary="Universal CUDA 12.8 Python 3.12 image with PyTorch 2.9.0" \
      description="Universal image combining minimal Jupyter workbench and runtime ML stack (CUDA 12.8, PyTorch 2.9.0, FlashAttention 2.8.3) on UBI9" \
      io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
      io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided."

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

USER 0
WORKDIR /opt/app-root/bin

# Environment variables for NVIDIA and CUDA
# Re-declare CUDA_VERSION inside this stage: ARGs defined before the first FROM
# are not visible after a FROM unless repeated, so without this line the
# CUDA_VERSION env var below would expand to an empty string.
ARG CUDA_VERSION
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda

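# For reference (informational note, not used by the build): the compute
# capabilities in TORCH_CUDA_ARCH_LIST map to NVIDIA GPU generations roughly as
# 8.0 = A100 (Ampere), 8.6 = Ampere workstation/consumer parts (A40, RTX 30xx),
# 8.9 = Ada Lovelace (L4, L40, RTX 40xx), 9.0 = Hopper (H100). CUDA extensions
# built from source in this image are compiled for these architectures only.
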
################################################################################
# System Dependencies Stage
################################################################################
FROM base AS system-deps

USER 0
WORKDIR /opt/app-root/bin

# Copy repository configuration files
COPY cuda.repo mellanox.repo /etc/yum.repos.d/

# Install system packages (RDMA, CUDA tools, build toolchain)
# The package list is also recorded in rpms.in.yaml for documentation purposes
#
# RDMA/InfiniBand packages (from mellanox.repo):
# - libibverbs-utils, infiniband-diags: RDMA diagnostics and utilities
# - libibumad: User-space MAD (Management Datagram) library for InfiniBand
# - librdmacm, librdmacm-utils: RDMA connection management
# - rdma-core: Core RDMA user-space libraries
#
# NOTE: mlnx-tools is intentionally NOT included
# - mlnx-tools provides Mellanox-specific diagnostics (mlxlink, mlxconfig, etc.)
# - For containerized ML workloads, standard InfiniBand tools (infiniband-diags) are sufficient
# - Reduces image size and dependency complexity
# - If needed, mlnx-tools can be appended to the dnf install list below
#
# CUDA packages (from cuda.repo):
# - cuda-command-line-tools-12-8: CUDA CLI utilities
# - cuda-cudart-devel-12-8: CUDA runtime development headers
# - cuda-nvcc-12-8-12.8.93-1: CUDA compiler (specific version for reproducibility)
#
# Build toolchain (from UBI repos):
# - gcc, gcc-c++, make: C/C++ compilation tools
# - python3-devel: Python headers for building native extensions
# - cmake: Build system (required by some Python packages)
# - git: Version control (some pip installs need it)
#
# --setopt=install_weak_deps=False: Don't install recommended packages (minimizes image size)
RUN dnf install -y --setopt=install_weak_deps=False \
    libibverbs-utils \
    infiniband-diags \
    libibumad \
    librdmacm \
    librdmacm-utils \
    rdma-core \
    cuda-command-line-tools-12-8 \
    cuda-cudart-devel-12-8 \
    cuda-nvcc-12-8-12.8.93-1 \
    gcc \
    gcc-c++ \
    make \
    python3-devel \
    cmake \
    git && dnf clean all && rm -rf /var/cache/dnf/*
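
# Note on the pinned compiler package above (informational): dnf accepts
# name-version-release spellings, so "cuda-nvcc-12-8-12.8.93-1" requests the
# package "cuda-nvcc-12-8" at version 12.8.93, release 1, rather than whatever
# the repository currently marks as latest.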

# Verify CUDA toolkit
RUN /usr/local/cuda/bin/nvcc -V
RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || \
    (echo "[fail-fast] CUDA libs not found" >&2; exit 1)

# Bundle RDMA runtime libs to a staging dir
RUN mkdir -p /opt/rdma-runtime \
    && cp -a /usr/lib64/libibverbs* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/librdmacm* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libibumad* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libmlx* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libibnetdisc* /opt/rdma-runtime/ || true
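
# Illustrative only (assumed runtime check, not executed during the build): once
# the final image runs on a host with InfiniBand devices exposed, the staged
# RDMA libraries and tools can be sanity-checked with, e.g.:
#   ldconfig -p | grep -E 'libibverbs|librdmacm'
#   ibv_devinfo   # from libibverbs-utils; requires /dev/infiniband from the host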

################################################################################
# Python Dependencies Stage
################################################################################
FROM system-deps AS python-deps

USER 0
WORKDIR /tmp/deps

# Copy uv from builder stage (FIPS: uv only used during build, not in runtime)
COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv

# Copy pyproject.toml, pylock.toml, and requirements-special.txt
# pylock.toml contains most dependencies
# requirements-special.txt contains packages needing --no-build-isolation
COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./

# Switch to user 1001 for pip installations
USER 1001
WORKDIR /opt/app-root/src

# Install main dependencies from pylock.toml using uv pip sync
# This syncs the environment to match exactly what's in the lockfile
#
# UV_NO_CACHE explained:
# What: Sets UV_NO_CACHE=1 for the sync below, then resets it to an empty string
# Why: Running as user 1001, uv would try to write to its cache directory,
#      which may not be writable by that UID. Disabling the cache avoids this.
# Why reset: ENV changes persist across layers, and a Dockerfile cannot truly
#      unset an ENV variable in a later instruction, so resetting it to an empty
#      string is the closest approximation and limits its effect on subsequent
#      steps and the runtime environment.
ENV UV_NO_CACHE=1
RUN uv pip sync --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml
ENV UV_NO_CACHE=
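
# Alternative worth noting (assumed equivalent, not what this build uses): uv
# also exposes the cache toggle as a command-line flag, so the ENV dance above
# could be replaced by passing it directly to the single invocation:
#   uv pip sync --no-cache --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml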

# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
# TODO: use aipcc index
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
    "git+https://github.com/opendatahub-io/kubeflow-sdk@main"

# Install special packages with proper flags
# These packages require --no-build-isolation to use pre-installed CUDA tools
# and must be installed in a specific order

# Copy requirements-special.txt for installation
COPY --chown=1001:0 requirements-special.txt /tmp/deps/

# 1. Flash Attention (standalone, needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^flash-attn" /tmp/deps/requirements-special.txt)

# 2. Mamba SSM dependencies (order matters!)
# - causal-conv1d first (needs --no-build-isolation)
# - mamba-ssm second (needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir \
    $(grep "^causal-conv1d" /tmp/deps/requirements-special.txt) \
    && pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^mamba-ssm" /tmp/deps/requirements-special.txt)

# Fix permissions for OpenShift
# What: Adjusts file permissions for OpenShift/Kubernetes compatibility
# Why: OpenShift runs containers with arbitrary user IDs but fixed group ID (root group)
# - chmod g+w: Allows group write access to site-packages (for pip installs at runtime)
# - fix-permissions: UBI-provided script that ensures group ownership/permissions
# When needed: Required for any container that may run in OpenShift with arbitrary UIDs
ARG PYTHON_VERSION
USER 0
RUN chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages \
    && fix-permissions /opt/app-root -P
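
# Illustrative only (assumed example of how OpenShift launches this image): the
# project UID below is arbitrary; what matters is that it is non-root while the
# group is 0, which is why group-writable files are needed:
#   podman run --rm --user 1000960000:0 <image> id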

# Clean up uv and build artifacts (FIPS: remove build-only tools)
RUN rm -f /usr/local/bin/uv \
    && rm -rf /tmp/deps \
    && dnf remove -y gcc gcc-c++ cmake python3-devel \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

################################################################################
# Final Stage - FIPS-friendly Runtime
################################################################################
FROM ${BASE_IMAGE} AS final

USER 0
WORKDIR /opt/app-root/src

# Copy Python site-packages and CLI entry points from python-deps stage
# This excludes build tools like gcc, cmake, uv (FIPS friendly)
ARG PYTHON_VERSION
COPY --from=python-deps /opt/app-root/lib/python${PYTHON_VERSION}/site-packages /opt/app-root/lib/python${PYTHON_VERSION}/site-packages
COPY --from=python-deps /opt/app-root/bin /opt/app-root/bin

# Copy CUDA runtime from system-deps (built Python packages need CUDA libs)
# Contains all necessary CUDA libraries - no need to install via dnf
COPY --from=system-deps /usr/local/cuda /usr/local/cuda

# Copy RDMA runtime libraries from system-deps
# These are needed for InfiniBand/RDMA support at runtime
COPY --from=system-deps /opt/rdma-runtime/ /usr/lib64/

# Update dynamic linker cache for CUDA libraries
# What: ldconfig updates the runtime linker's cache of shared libraries
# Why: After copying CUDA libraries to /usr/local/cuda, the system needs to know where to find them
# - Scans directories like /usr/local/cuda/lib64 (defined in /etc/ld.so.conf.d/)
# - Updates /etc/ld.so.cache so programs can locate libcudart.so, libcublas.so, etc.
# When needed: Required after installing/copying shared libraries to non-standard locations
# Test: Run "ldconfig -p | grep cuda" to see if CUDA libs are in the cache
RUN ldconfig

# FIPS-friendly: Remove uv from final image (inherited from base image)
# uv is only needed during build, not at runtime
RUN rm -f /opt/app-root/bin/uv

# Environment variables for NVIDIA and CUDA
# Re-declare CUDA_VERSION in this stage so the ENV below picks up the build-arg
# value (per-stage ARG scoping, same pattern as PYTHON_VERSION above)
ARG CUDA_VERSION
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

# Copy entrypoint
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Fix permissions for OpenShift (final stage)
# What: Ensures proper permissions for OpenShift/Kubernetes arbitrary UIDs
# Why: After copying site-packages from python-deps stage, permissions need adjustment
# - OpenShift assigns random UID but fixed GID (usually 0, root group)
# - Group write permissions allow pip to install packages at runtime
# - fix-permissions ensures all files have correct group ownership
# When: Required in final stage because COPY operations reset permissions
# Context: This is the second time we do this - once after building packages,
# and again after copying them to the final stage
RUN fix-permissions /opt/app-root -P \
    && chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages

USER 1001
WORKDIR /opt/app-root/src

ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]
