# ROCm Image Dockerfile
#
# FIPS-friendly features:
# - uv is used only in the build stages (it is not shipped in the runtime image)
# - Build tools are isolated in intermediate stages
# - The final image contains only runtime dependencies

################################################################################
# Build Arguments
################################################################################
# Both arguments are declared ahead of the first FROM, which makes them usable
# in FROM lines. A stage that needs one of them inside its own instructions has
# to redeclare it with a bare `ARG <name>`.

# Python major.minor version; used to build the site-packages path.
ARG PYTHON_VERSION=3.12
# Workbench image that the stages below start from.
ARG BASE_IMAGE=quay.io/opendatahub/odh-workbench-jupyter-minimal-rocm-py312-ubi9:2025b-v1.39

################################################################################
# Builder Stage - Install uv for dependency resolution
################################################################################
FROM ${BASE_IMAGE} AS builder

# Optional uv version pin. Left empty (the default), the newest uv release is
# installed, exactly as before. Pass `--build-arg UV_VERSION=<x.y.z>` to make
# the build reproducible.
ARG UV_VERSION=""

USER 0
WORKDIR /tmp/builder

# Install uv in this throw-away stage only; the python-deps stage copies the
# binary out of it.
# ${UV_VERSION:+...} expands to "==<version>" only when UV_VERSION is non-empty,
# so the requirement is plain "uv" by default.
RUN pip install --no-cache-dir "uv${UV_VERSION:+==${UV_VERSION}}"

################################################################################
# Base Stage
################################################################################
FROM ${BASE_IMAGE} AS base

LABEL name="rocm:py312-rocm64-torch280" \
      summary="ROCm 6.4 Python 3.12 image with PyTorch 2.8.0" \
      description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.8.0) on UBI9" \
      io.k8s.display-name="ROCm 6.4 Python 3.12 (Workbench + Runtime)" \
      io.k8s.description="ROCm image: Jupyter workbench by default; runtime when command provided."

# Copy license file
COPY LICENSE.md /licenses/rocm-license.md

USER 0
WORKDIR /opt/app-root/bin

# Environment variables for ROCm.
# LD_LIBRARY_PATH: the previous value is appended only when one exists. A plain
# "/opt/rocm/lib:$LD_LIBRARY_PATH" would end in a bare ":" whenever the base
# image does not define the variable, and the dynamic linker treats an empty
# path entry as the current working directory.
ENV ROCM_HOME=/opt/rocm \
    PATH=/opt/rocm/bin:$PATH \
    LD_LIBRARY_PATH=/opt/rocm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

################################################################################
# System Dependencies Stage
################################################################################
FROM base AS system-deps

USER 0
WORKDIR /opt/app-root/bin

# Copy repository configuration files
COPY mellanox.repo rocm.repo /etc/yum.repos.d/

# Install ROCm development tools
# Using individual packages instead of metapackages to avoid python3-wheel dependency issue
# hipcc is the HIP compiler needed for flash-attention build
# rocm-device-libs provides the GPU device library required by clang for ROCm compilation
#
# hipcc installs to /opt/rocm-X.Y.Z/bin but we need /opt/rocm/bin/hipcc, so the
# last part of this RUN links it there:
# - `set --` loads the glob matches into "$@". Exactly one existing match is
#   required; an unmatched glob stays a literal pattern and would otherwise
#   become a dangling symlink without failing the build.
# - The link is skipped when both paths already name the same file (for example
#   when /opt/rocm is itself a symlink to the versioned directory), so the real
#   binary is never replaced by a link to itself.
# - `-ef` is a test extension; this assumes /bin/sh in the base image supports
#   it (bash and dash do).
RUN dnf install -y --setopt=install_weak_deps=False \
        hipcc \
        hip-devel \
        hip-runtime-amd \
        rocm-cmake \
        rocm-device-libs \
        rocblas-devel \
        hipblas-devel \
        rocsolver-devel \
        hipsolver-devel && \
    dnf clean all && rm -rf /var/cache/dnf/* && \
    set -- /opt/rocm-*/bin/hipcc && \
    if [ "$#" -ne 1 ] || [ ! -e "$1" ]; then \
        echo "ERROR: expected exactly one /opt/rocm-*/bin/hipcc, found: $*" >&2; \
        exit 1; \
    fi && \
    if [ ! "$1" -ef /opt/rocm/bin/hipcc ]; then \
        ln -sf "$1" /opt/rocm/bin/hipcc; \
    fi

# System packages: RDMA user-space stack plus the native build toolchain.
#
# RDMA/InfiniBand packages (from mellanox.repo):
# - infiniband-diags, libibverbs-utils: RDMA diagnostics and utilities
# - libibumad: user-space MAD (Management Datagram) library for InfiniBand
# - librdmacm, librdmacm-utils: RDMA connection management
# - rdma-core: core RDMA user-space libraries
#
# Build toolchain (from UBI repos):
# - gcc, gcc-c++, make: C/C++ compilation tools
# - python3-devel: Python headers for building native extensions
# - cmake: build system (required by some Python packages)
# - git: version control (some pip installs need it)
#
# The package list is kept in alphabetical order; the dnf cache is removed in
# the same layer that created it.
RUN dnf install -y --setopt=install_weak_deps=False \
        cmake \
        gcc \
        gcc-c++ \
        git \
        infiniband-diags \
        libibumad \
        libibverbs-utils \
        librdmacm \
        librdmacm-utils \
        make \
        python3-devel \
        rdma-core \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Bundle RDMA runtime libs to a staging dir
# Staging stays best-effort per library family: when nothing under /usr/lib64
# matches a family (or the copy fails), a warning is written to stderr and the
# loop moves on instead of failing the build. A failure to create the staging
# directory itself does abort the build.
# `cp -a` keeps symlinks and modes and also copies matching directories.
RUN mkdir -p /opt/rdma-runtime && \
    for lib in libibverbs librdmacm libibumad libmlx libibnetdisc; do \
        cp -a /usr/lib64/"${lib}"* /opt/rdma-runtime/ || \
            echo "WARNING: could not stage /usr/lib64/${lib}* into /opt/rdma-runtime" >&2; \
    done

################################################################################
# Python Dependencies Stage
################################################################################
FROM system-deps AS python-deps

USER 0
WORKDIR /tmp/deps

# uv comes from the builder stage and is placed on PATH for this stage only
# (FIPS: uv is a build-time tool, not part of the runtime image).
COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv

# Dependency manifests, owned by user 1001 / group 0.
# pylock.toml: all dependencies including ROCm PyTorch (compiled with --find-links)
COPY --chown=1001:0 pyproject.toml pylock.toml /tmp/deps/

# Switch to user 1001 for pip installations
USER 1001
WORKDIR /opt/app-root/src

# Redeclared so this stage can read the global PYTHON_VERSION build argument.
ARG PYTHON_VERSION

# Install main dependencies from pylock.toml using uv pip sync
# This syncs the environment to match exactly what's in the lockfile
# pylock.toml was compiled with --find-links=https://download.pytorch.org/whl/rocm6.4
# so torch comes from ROCm index
#
# UV_NO_CACHE is set for this one command only. A later `ENV UV_NO_CACHE=` does
# not unset the variable; it leaves it defined as an empty string for the rest
# of the stage.
# The target Python version follows PYTHON_VERSION and falls back to 3.12 when
# that argument is empty.
RUN UV_NO_CACHE=1 uv pip sync \
        --python-platform=linux \
        --python-version="${PYTHON_VERSION:-3.12}" \
        /tmp/deps/pylock.toml

# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
# TODO: use aipcc index
# KUBEFLOW_SDK_REF is the branch, tag, or commit to install. The default "main"
# is a moving branch; pass `--build-arg KUBEFLOW_SDK_REF=<tag-or-sha>` for a
# reproducible build.
ARG KUBEFLOW_SDK_REF=main
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
        "git+https://github.com/opendatahub-io/kubeflow-sdk@${KUBEFLOW_SDK_REF}"

# TODO: Re-enable Flash Attention after confirming base image works
# Install Flash Attention from original Dao-AILab repo
# --no-build-isolation: Use already-installed torch instead of isolated env
# USER 0
# ENV GPU_ARCHS="gfx90a;gfx942"
# RUN cd /tmp \
#     && git clone --depth 1 --branch v2.8.3 https://github.com/Dao-AILab/flash-attention.git \
#     && cd flash-attention \
#     && git submodule update --init \
#     && MAX_JOBS="16" pip install --no-build-isolation --no-cache-dir --no-deps . \
#     && cd / && rm -rf /tmp/flash-attention


# Fix permissions for OpenShift: make site-packages group-writable, then run
# fix-permissions over /opt/app-root.
# NOTE(review): fix-permissions is not defined in this file; presumably it is a
# helper shipped in the base image - confirm.
ARG PYTHON_VERSION
USER 0
RUN chmod -R g+w "/opt/app-root/lib/python${PYTHON_VERSION}/site-packages" && \
    fix-permissions /opt/app-root -P

# Remove uv, the copied dependency manifests, and the compiler toolchain from
# this stage, then drop the dnf cache.
RUN rm -f /usr/local/bin/uv && \
    rm -rf /tmp/deps && \
    dnf remove -y cmake gcc gcc-c++ python3-devel && \
    dnf clean all && \
    rm -rf /var/cache/dnf/*

################################################################################
# Final Stage - FIPS-friendly Runtime
################################################################################
FROM ${BASE_IMAGE} AS final

USER 0
WORKDIR /opt/app-root/src

# Copy Python site-packages and CLI entry points from python-deps stage.
# --chown=1001:0 sets the owner explicitly (the same user/group the packages
# were installed with) instead of depending on how the builder carries file
# ownership across stages.
ARG PYTHON_VERSION
COPY --chown=1001:0 --from=python-deps /opt/app-root/lib/python${PYTHON_VERSION}/site-packages /opt/app-root/lib/python${PYTHON_VERSION}/site-packages
COPY --chown=1001:0 --from=python-deps /opt/app-root/bin /opt/app-root/bin

# Copy RDMA runtime libraries from system-deps
# These are needed for InfiniBand/RDMA support at runtime
COPY --from=system-deps /opt/rdma-runtime/ /usr/lib64/

# Refresh the dynamic linker cache so the libraries copied above are found, and
# make sure no uv executable is left among the copied entry points
# (FIPS-friendly: uv is a build-time tool only). Both run in one layer.
RUN ldconfig && rm -f /opt/app-root/bin/uv

# Environment variables for ROCm.
# LD_LIBRARY_PATH: the previous value is appended only when one exists. A plain
# "/opt/rocm/lib:$LD_LIBRARY_PATH" would end in a bare ":" whenever the base
# image does not define the variable, and the dynamic linker treats an empty
# path entry as the current working directory.
ENV ROCM_HOME=/opt/rocm \
    PATH=/opt/rocm/bin:$PATH \
    LD_LIBRARY_PATH=/opt/rocm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# Copy license file
COPY LICENSE.md /licenses/rocm-license.md

# Copy entrypoint (mode 0755 so it is executable for every user)
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Fix permissions for OpenShift (final stage): run fix-permissions over
# /opt/app-root, then make site-packages group-writable.
# PYTHON_VERSION is the build argument declared earlier in this stage.
RUN fix-permissions /opt/app-root -P && \
    chmod -R g+w "/opt/app-root/lib/python${PYTHON_VERSION}/site-packages"

# Run as the unprivileged user from the default working directory.
USER 1001
WORKDIR /opt/app-root/src

# The entrypoint script receives CMD as its arguments; with no command given it
# is invoked with start-notebook.sh.
ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]
0 commit comments