# Universal Image Dockerfile
#
# FIPS-friendly Features:
# - uv is used only in build stage (not shipped in runtime image)
# - Build tools are isolated in intermediate stages
# - Final image contains only runtime dependencies
# - OpenSSL FIPS mode supported via base image

# ###############################################################################
# Build Arguments
# ###############################################################################
ARG BASE_IMAGE=quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903
# CUDA_VERSION: Used for environment variables and documentation purposes
# - Sets CUDA_VERSION env var (helps with debugging and tooling)
# - Not used for package installation (specific versions hardcoded below)
ARG CUDA_VERSION=12.8
# PYTHON_VERSION: Critical for path resolution in multi-stage build
# - Used to locate site-packages directory (e.g., /opt/app-root/lib/python3.12/site-packages)
# - Must match base image Python version
ARG PYTHON_VERSION=3.12
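
# Example (illustrative, not part of the build): the ARGs above can be overridden
# at build time with podman/docker, e.g.
#   podman build \
#     --build-arg BASE_IMAGE=<alternate-workbench-base> \
#     --build-arg PYTHON_VERSION=3.12 \
#     -t universal:py312-cuda128-torch290 .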

# ###############################################################################
# Builder Stage - Install uv for dependency resolution
# ###############################################################################
FROM ${BASE_IMAGE} AS builder

USER 0
WORKDIR /tmp/builder

# Install latest version of uv in builder stage
# Why: Even if base image has uv, we want the latest version for:
# - Latest bug fixes and performance improvements
# - Consistent behavior across builds
# - Specific version control independent of base image
# Note: This uv is isolated in builder stage and copied selectively to other stages
RUN pip install --no-cache-dir uv
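# Note (illustrative): if reproducible builds matter more than "latest", the line
# above can pin uv to a known release instead, e.g.
#   RUN pip install --no-cache-dir "uv==<pinned-version>"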

# ###############################################################################
# Base Stage
# ###############################################################################
FROM ${BASE_IMAGE} AS base

LABEL name="universal:py312-cuda128-torch290" \
    summary="Universal CUDA 12.8 Python 3.12 image with PyTorch 2.9.0" \
    description="Universal image combining minimal Jupyter workbench and runtime ML stack (CUDA 12.8, PyTorch 2.9.0, FlashAttention 2.8.3) on UBI9" \
    io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
    io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided."

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

USER 0
WORKDIR /opt/app-root/bin
# Environment variables for NVIDIA and CUDA
# Re-declare CUDA_VERSION inside this stage: ARGs defined before the first FROM are
# only in scope for FROM lines, so without this the ENV below would expand to an
# empty string.
ARG CUDA_VERSION
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda
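
# For reference (informational comment only): the TORCH_CUDA_ARCH_LIST entries map to
#   8.0 = A100 (Ampere), 8.6 = A40 / RTX 30-series, 8.9 = L4 / L40 / RTX 40-series (Ada),
#   9.0 = H100 (Hopper)
# so native extensions built in this image target those GPU generations.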

# ###############################################################################
# System Dependencies Stage
# ###############################################################################
FROM base AS system-deps

USER 0
WORKDIR /opt/app-root/bin

# Copy repository configuration files
COPY cuda.repo mellanox.repo /etc/yum.repos.d/

# Install system packages (RDMA, CUDA tools, build toolchain)
# Package list defined in rpms.in.yaml for documentation
#
# RDMA/InfiniBand packages (from mellanox.repo):
# - libibverbs-utils, infiniband-diags: RDMA diagnostics and utilities
# - libibumad: User-space MAD (Management Datagram) library for InfiniBand
# - librdmacm, librdmacm-utils: RDMA connection management
# - rdma-core: Core RDMA user-space libraries
#
# NOTE: mlnx-tools is intentionally NOT included
# - mlnx-tools provides Mellanox-specific diagnostics (mlxlink, mlxconfig, etc.)
# - For containerized ML workloads, standard InfiniBand tools (infiniband-diags) are sufficient
# - Reduces image size and dependency complexity
# - If needed, add mlnx-tools to the dnf install list below
#
# CUDA packages (from cuda.repo):
# - cuda-command-line-tools-12-8: CUDA CLI utilities
# - cuda-cudart-devel-12-8: CUDA runtime development headers
# - cuda-nvcc-12-8-12.8.93-1: CUDA compiler (specific version for reproducibility)
#
# Build toolchain (from UBI repos):
# - gcc, gcc-c++, make: C/C++ compilation tools
# - python3-devel: Python headers for building native extensions
# - cmake: Build system (required by some Python packages)
# - git: Version control (some pip installs need it)
#
# --setopt=install_weak_deps=False: Don't install recommended packages (minimizes image size)
RUN dnf install -y --setopt=install_weak_deps=False \
    libibverbs-utils \
    infiniband-diags \
    libibumad \
    librdmacm \
    librdmacm-utils \
    rdma-core \
    cuda-command-line-tools-12-8 \
    cuda-cudart-devel-12-8 \
    cuda-nvcc-12-8-12.8.93-1 \
    gcc \
    gcc-c++ \
    make \
    python3-devel \
    cmake \
    git && dnf clean all && rm -rf /var/cache/dnf/*

# Verify CUDA toolkit
RUN /usr/local/cuda/bin/nvcc -V
RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || \
    (echo "[fail-fast] CUDA libs not found" >&2; exit 1)

# Bundle RDMA runtime libs to a staging dir
RUN mkdir -p /opt/rdma-runtime \
    && cp -a /usr/lib64/libibverbs* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/librdmacm* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libibumad* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libmlx* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libibnetdisc* /opt/rdma-runtime/ || true

# ###############################################################################
# Python Dependencies Stage
# ###############################################################################
FROM system-deps AS python-deps

USER 0
WORKDIR /tmp/deps

# Copy uv from builder stage (FIPS: uv only used during build, not in runtime)
COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv

# Copy pyproject.toml, pylock.toml, and requirements-special.txt
# pylock.toml contains most dependencies
# requirements-special.txt contains packages needing --no-build-isolation
COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./
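
# Note (assumed workflow, not executed here): pylock.toml is a PEP 751 lockfile;
# with a uv release that supports that format it can be regenerated from
# pyproject.toml with something like
#   uv pip compile pyproject.toml -o pylock.toml
# Verify the exact invocation against the uv version in use.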

# Switch to user 1001 for pip installations
USER 1001
WORKDIR /opt/app-root/src

# Install main dependencies from pylock.toml using uv pip sync
# This syncs the environment to match exactly what's in the lockfile
#
# UV_NO_CACHE explained:
# What: Sets UV_NO_CACHE=1 for the sync, then resets it to an empty string
# Why: Running as user 1001 causes uv to try writing to a cache directory
#      which may have permission issues. Disabling the cache avoids this.
# Why reset: ENV changes persist across layers, and Docker ENV cannot truly unset a
#      variable. Resetting it to an empty string is the closest approximation, so the
#      "1" does not carry over into subsequent operations or runtime.
ENV UV_NO_CACHE=1
RUN uv pip sync --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml
ENV UV_NO_CACHE=
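
# Alternative (illustrative): uv documents UV_NO_CACHE as equivalent to the --no-cache
# flag, so the same effect can be scoped to a single invocation without touching ENV, e.g.
#   RUN uv pip sync --no-cache --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml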

# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
# TODO: use aipcc index
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
    "git+https://github.com/opendatahub-io/kubeflow-sdk@main"

# Install special packages with proper flags
# These packages require --no-build-isolation to use pre-installed CUDA tools
# and must be installed in a specific order

# Copy requirements-special.txt for installation
COPY --chown=1001:0 requirements-special.txt /tmp/deps/
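
# Illustrative shape of requirements-special.txt (actual pins live in that file;
# the flash-attn pin matches the FlashAttention 2.8.3 advertised in the image labels,
# the other versions are placeholders):
#   flash-attn==2.8.3
#   causal-conv1d==<pinned-version>
#   mamba-ssm==<pinned-version>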

# 1. Flash Attention (standalone, needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^flash-attn" /tmp/deps/requirements-special.txt)

# 2. Mamba SSM dependencies (order matters!)
# - causal-conv1d first (needs --no-build-isolation)
# - mamba-ssm second (needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir \
    $(grep "^causal-conv1d" /tmp/deps/requirements-special.txt) \
    && pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^mamba-ssm" /tmp/deps/requirements-special.txt)

# Fix permissions for OpenShift
# What: Adjusts file permissions for OpenShift/Kubernetes compatibility
# Why: OpenShift runs containers with arbitrary user IDs but fixed group ID (root group)
# - chmod g+w: Allows group write access to site-packages (for pip installs at runtime)
# - fix-permissions: UBI-provided script that ensures group ownership/permissions
# When needed: Required for any container that may run in OpenShift with arbitrary UIDs
ARG PYTHON_VERSION
USER 0
RUN chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages \
    && fix-permissions /opt/app-root -P

# Clean up uv and build artifacts (FIPS: remove build-only tools)
RUN rm -f /usr/local/bin/uv \
    && rm -rf /tmp/deps \
    && dnf remove -y gcc gcc-c++ cmake python3-devel \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# ###############################################################################
# Final Stage - FIPS-friendly Runtime
# ###############################################################################
FROM ${BASE_IMAGE} AS final

USER 0
WORKDIR /opt/app-root/src

# Copy Python site-packages and CLI entry points from python-deps stage
# This excludes build tools like gcc, cmake, uv (FIPS friendly)
ARG PYTHON_VERSION
COPY --from=python-deps /opt/app-root/lib/python${PYTHON_VERSION}/site-packages /opt/app-root/lib/python${PYTHON_VERSION}/site-packages
COPY --from=python-deps /opt/app-root/bin /opt/app-root/bin

# Copy CUDA runtime from system-deps (built Python packages need CUDA libs)
# Contains all necessary CUDA libraries - no need to install via dnf
COPY --from=system-deps /usr/local/cuda /usr/local/cuda

# Copy RDMA runtime libraries from system-deps
# These are needed for InfiniBand/RDMA support at runtime
COPY --from=system-deps /opt/rdma-runtime/ /usr/lib64/

# Update dynamic linker cache for CUDA libraries
# What: ldconfig updates the runtime linker's cache of shared libraries
# Why: After copying CUDA libraries to /usr/local/cuda, the system needs to know where to find them
# - Scans directories like /usr/local/cuda/lib64 (defined in /etc/ld.so.conf.d/)
# - Updates /etc/ld.so.cache so programs can locate libcudart.so, libcublas.so, etc.
# When needed: Required after installing/copying shared libraries to non-standard locations
# Test: Run "ldconfig -p | grep cuda" to see if CUDA libs are in the cache
RUN ldconfig

# FIPS-friendly: Remove uv from final image (inherited from base image)
# uv is only needed during build, not at runtime
RUN rm -f /opt/app-root/bin/uv

# Environment variables for NVIDIA and CUDA
# Re-declare CUDA_VERSION inside this stage (global ARGs are not visible within a
# stage unless re-declared), otherwise the ENV below would expand to an empty string.
ARG CUDA_VERSION
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

# Copy entrypoint
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Fix permissions for OpenShift (final stage)
# What: Ensures proper permissions for OpenShift/Kubernetes arbitrary UIDs
# Why: After copying site-packages from python-deps stage, permissions need adjustment
# - OpenShift assigns random UID but fixed GID (usually 0, root group)
# - Group write permissions allow pip to install packages at runtime
# - fix-permissions ensures all files have correct group ownership
# When: Required in final stage because COPY operations reset permissions
# Context: This is the second time we do this - once after building packages,
#          and again after copying them to the final stage
RUN fix-permissions /opt/app-root -P \
    && chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages

USER 1001
WORKDIR /opt/app-root/src

ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]
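
# Usage (illustrative, based on the ENTRYPOINT/CMD above and the io.k8s.description label):
#   podman run --rm -p 8888:8888 universal:py312-cuda128-torch290
#       -> no command given, so the entrypoint falls through to start-notebook.sh (Jupyter workbench)
#   podman run --rm universal:py312-cuda128-torch290 python train.py
#       -> an explicit command switches the image into runtime mode (train.py is a placeholder)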