Skip to content

Commit d200340

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 80b4713 + 9c68578 commit d200340

File tree

8 files changed

+3462
-0
lines changed

8 files changed

+3462
-0
lines changed
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
# ROCm Image Dockerfile
2+
#
3+
# FIPS-friendly Features:
4+
# - uv is used only in build stage (not shipped in runtime image)
5+
# - Build tools are isolated in intermediate stages
6+
# - Final image contains only runtime dependencies
7+
8+
################################################################################
9+
# Build Arguments
10+
################################################################################
11+
ARG BASE_IMAGE=quay.io/opendatahub/odh-workbench-jupyter-minimal-rocm-py312-ubi9:2025b-v1.39
12+
ARG PYTHON_VERSION=3.12
13+
14+
################################################################################
15+
# Builder Stage - Install uv for dependency resolution
16+
################################################################################
17+
# Builder stage: exists only to obtain the uv binary. The python-deps stage
# copies /opt/app-root/bin/uv out of this stage, so nothing else built here
# reaches later stages.
FROM ${BASE_IMAGE} AS builder

USER 0
WORKDIR /tmp/builder

# Optional uv version pin. Empty by default, which installs the latest release
# (the previous behaviour). Pass --build-arg UV_VERSION=<x.y.z> to make the
# build reproducible.
ARG UV_VERSION=""

# "${UV_VERSION:+==${UV_VERSION}}" expands to "==<version>" only when a version
# was supplied, otherwise to nothing, so the requirement is either "uv" or
# "uv==<version>".
RUN pip install --no-cache-dir "uv${UV_VERSION:+==${UV_VERSION}}"
24+
25+
################################################################################
26+
# Base Stage
27+
################################################################################
28+
# Base stage: shared starting point for the build stages (system-deps derives
# from it). Carries image metadata, the license file and the ROCm environment.
FROM ${BASE_IMAGE} AS base

LABEL name="rocm:py312-rocm64-torch280" \
      summary="ROCm 6.4 Python 3.12 image with PyTorch 2.8.0" \
      description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.8.0) on UBI9" \
      io.k8s.display-name="ROCm 6.4 Python 3.12 (Workbench + Runtime)" \
      io.k8s.description="ROCm image: Jupyter workbench by default; runtime when command provided."

# Copy license file
COPY LICENSE.md /licenses/rocm-license.md

USER 0
WORKDIR /opt/app-root/bin

# Environment variables for ROCm.
# LD_LIBRARY_PATH is only suffixed with the previous value when one exists:
# a plain "/opt/rocm/lib:$LD_LIBRARY_PATH" would end in a bare ":" if the
# variable is unset in the parent image, and the dynamic loader treats an empty
# entry as "current working directory".
ENV ROCM_HOME=/opt/rocm \
    PATH=/opt/rocm/bin:$PATH \
    LD_LIBRARY_PATH=/opt/rocm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
46+
47+
################################################################################
48+
# System Dependencies Stage
49+
################################################################################
50+
# System dependencies stage: adds the extra package repositories and the ROCm
# development packages on top of the base stage.
FROM base AS system-deps

USER 0
WORKDIR /opt/app-root/bin

# Copy repository configuration files
COPY mellanox.repo rocm.repo /etc/yum.repos.d/

# Install ROCm development tools
# Using individual packages instead of metapackages to avoid python3-wheel dependency issue
# hipcc is the HIP compiler needed for flash-attention build
# rocm-device-libs provides the GPU device library required by clang for ROCm compilation
#
# hipcc installs to /opt/rocm-X.Y.Z/bin but we need /opt/rocm/bin/hipcc, so a
# symlink is created afterwards. The link step is guarded:
#   - skipped when /opt/rocm/bin/hipcc already resolves to an executable
#     (e.g. /opt/rocm is itself a link to the versioned directory, in which
#     case "ln -sf" would replace the real binary with a link to itself);
#   - picks a single target when several /opt/rocm-* directories exist (the
#     last executable match in glob order), because "ln" with several sources
#     and a non-directory destination fails;
#   - fails the build when no hipcc binary is found instead of leaving a
#     dangling link behind.
RUN dnf install -y --setopt=install_weak_deps=False \
        hipcc \
        hip-devel \
        hip-runtime-amd \
        rocm-cmake \
        rocm-device-libs \
        rocblas-devel \
        hipblas-devel \
        rocsolver-devel \
        hipsolver-devel && \
    dnf clean all && rm -rf /var/cache/dnf/* && \
    if [ ! -x /opt/rocm/bin/hipcc ]; then \
        hipcc_path="" && \
        for candidate in /opt/rocm-*/bin/hipcc; do \
            if [ -x "${candidate}" ]; then hipcc_path="${candidate}"; fi; \
        done && \
        [ -n "${hipcc_path}" ] && \
        mkdir -p /opt/rocm/bin && \
        ln -sf "${hipcc_path}" /opt/rocm/bin/hipcc; \
    fi
75+
76+
# Install system packages (RDMA and build toolchain)
77+
#
78+
# RDMA/InfiniBand packages (from mellanox.repo):
79+
# - libibverbs-utils, infiniband-diags: RDMA diagnostics and utilities
80+
# - libibumad: User-space MAD (Management Datagram) library for InfiniBand
81+
# - librdmacm, librdmacm-utils: RDMA connection management
82+
# - rdma-core: Core RDMA user-space libraries
83+
#
84+
# Build toolchain (from UBI repos):
85+
# - gcc, gcc-c++, make: C/C++ compilation tools
86+
# - python3-devel: Python headers for building native extensions
87+
# - cmake: Build system (required by some Python packages)
88+
# - git: Version control (some pip installs need it)
89+
# One transaction for the RDMA user-space packages and the native build
# toolchain; weak dependencies are disabled and the dnf caches are dropped in
# the same layer. Packages are listed alphabetically.
RUN dnf install -y --setopt=install_weak_deps=False \
        cmake \
        gcc \
        gcc-c++ \
        git \
        infiniband-diags \
        libibumad \
        libibverbs-utils \
        librdmacm \
        librdmacm-utils \
        make \
        python3-devel \
        rdma-core && \
    dnf clean all && \
    rm -rf /var/cache/dnf/*
102+
103+
# Bundle RDMA runtime libs to a staging dir
104+
# Stage the RDMA shared libraries under /opt/rdma-runtime so the final stage can
# copy them with a single COPY --from. Every copy is best-effort: a library
# family that is not installed is skipped without failing the build.
RUN mkdir -p /opt/rdma-runtime; \
    for lib_prefix in libibverbs librdmacm libibumad libmlx libibnetdisc; do \
        cp -a /usr/lib64/${lib_prefix}* /opt/rdma-runtime/ || true; \
    done
110+
111+
################################################################################
112+
# Python Dependencies Stage
113+
################################################################################
114+
# Python dependencies stage: installs the locked Python environment on top of
# the system-deps stage.
FROM system-deps AS python-deps

USER 0
WORKDIR /tmp/deps

# Copy uv from builder stage (FIPS: uv only used during build, not in runtime)
COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv

# Copy dependency files
# pylock.toml: All dependencies including ROCm PyTorch (compiled with --find-links)
COPY --chown=1001:0 pyproject.toml pylock.toml ./

# Switch to user 1001 for pip installations
USER 1001
WORKDIR /opt/app-root/src

# Re-declare the global build arg so it is visible inside this stage; the
# default (3.12) comes from the ARG at the top of the file.
ARG PYTHON_VERSION

# Install main dependencies from pylock.toml using uv pip sync
# This syncs the environment to match exactly what's in the lockfile
# pylock.toml was compiled with --find-links=https://download.pytorch.org/whl/rocm6.4
# so torch comes from ROCm index
#
# UV_NO_CACHE is set for this command only. Setting it with ENV and then
# resetting it with "ENV UV_NO_CACHE=" would leave an empty variable in the
# environment of every later instruction in this stage.
RUN UV_NO_CACHE=1 uv pip sync --python-platform=linux --python-version="${PYTHON_VERSION}" /tmp/deps/pylock.toml
137+
138+
# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
139+
# TODO: use aipcc index
140+
# Git ref (branch, tag or commit SHA) of kubeflow-sdk to install. Defaults to
# the moving "main" branch, as before; pass
# --build-arg KUBEFLOW_SDK_REF=<tag-or-sha> for a reproducible build.
ARG KUBEFLOW_SDK_REF=main
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
    "git+https://github.com/opendatahub-io/kubeflow-sdk@${KUBEFLOW_SDK_REF}"
142+
143+
# TODO: Re-enable Flash Attention after confirming base image works
144+
# Install Flash Attention from original Dao-AILab repo
145+
# --no-build-isolation: Use already-installed torch instead of isolated env
146+
# USER 0
147+
# ENV GPU_ARCHS="gfx90a;gfx942"
148+
# RUN cd /tmp \
149+
# && git clone --depth 1 --branch v2.8.3 https://github.com/Dao-AILab/flash-attention.git \
150+
# && cd flash-attention \
151+
# && git submodule update --init \
152+
# && MAX_JOBS="16" pip install --no-build-isolation --no-cache-dir --no-deps . \
153+
# && cd / && rm -rf /tmp/flash-attention
154+
155+
156+
# Fix permissions for OpenShift
157+
# Back to root for the permission fix-up and package removal below.
USER 0
ARG PYTHON_VERSION

# Make site-packages group-writable, then run the fix-permissions helper over
# /opt/app-root (OpenShift permission fix-up).
RUN chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages && \
    fix-permissions /opt/app-root -P

# Drop uv, the copied dependency manifests and the compiler toolchain, then
# clear the dnf caches.
RUN rm -f /usr/local/bin/uv && \
    rm -rf /tmp/deps && \
    dnf remove -y gcc gcc-c++ cmake python3-devel && \
    dnf clean all && \
    rm -rf /var/cache/dnf/*
168+
169+
################################################################################
170+
# Final Stage - FIPS-friendly Runtime
171+
################################################################################
172+
# Final stage: starts again from the pristine base image and pulls in only the
# artifacts produced by the earlier stages (Python packages, entry points and
# RDMA libraries).
FROM ${BASE_IMAGE} AS final

# Image metadata. This stage is built FROM ${BASE_IMAGE}, not from the "base"
# stage, so labels declared there never reach the shipped image; they are
# repeated here.
LABEL name="rocm:py312-rocm64-torch280" \
      summary="ROCm 6.4 Python 3.12 image with PyTorch 2.8.0" \
      description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.8.0) on UBI9" \
      io.k8s.display-name="ROCm 6.4 Python 3.12 (Workbench + Runtime)" \
      io.k8s.description="ROCm image: Jupyter workbench by default; runtime when command provided."

USER 0
WORKDIR /opt/app-root/src

# Copy Python site-packages and CLI entry points from python-deps stage
ARG PYTHON_VERSION
COPY --from=python-deps /opt/app-root/lib/python${PYTHON_VERSION}/site-packages /opt/app-root/lib/python${PYTHON_VERSION}/site-packages
COPY --from=python-deps /opt/app-root/bin /opt/app-root/bin

# Copy RDMA runtime libraries from system-deps
# These are needed for InfiniBand/RDMA support at runtime
COPY --from=system-deps /opt/rdma-runtime/ /usr/lib64/

# Update the dynamic linker cache for the libraries copied above and, in the
# same layer, make sure no uv binary is left in /opt/app-root/bin
# (FIPS-friendly: uv must not ship in the runtime image).
RUN ldconfig \
    && rm -f /opt/app-root/bin/uv

# Environment variables for ROCm.
# LD_LIBRARY_PATH is only suffixed with the previous value when one exists:
# a plain "/opt/rocm/lib:$LD_LIBRARY_PATH" would end in a bare ":" if the
# variable is unset in the parent image, and the dynamic loader treats an empty
# entry as "current working directory".
ENV ROCM_HOME=/opt/rocm \
    PATH=/opt/rocm/bin:$PATH \
    LD_LIBRARY_PATH=/opt/rocm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# Copy license file
COPY LICENSE.md /licenses/rocm-license.md

# Copy entrypoint
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Fix permissions for OpenShift (final stage)
RUN fix-permissions /opt/app-root -P \
    && chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages

# Drop root for runtime.
USER 1001
WORKDIR /opt/app-root/src

# The entrypoint starts the notebook when NOTEBOOK_ARGS is set; otherwise it
# executes the supplied command, which defaults to CMD below.
ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2023 - 2025 Advanced Micro Devices, Inc. All rights reserved.
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/usr/bin/env sh
set -e
# Universal entrypoint.
#
# Dispatch rules:
#   1. NOTEBOOK_ARGS defined (even as an empty string) -> start-notebook.sh,
#      with the arguments from NOTEBOOK_ARGS when it is non-empty.
#   2. NOTEBOOK_ARGS undefined and a command was passed -> exec that command.
#   3. Neither -> print usage to stderr and exit with status 2.

# "${NOTEBOOK_ARGS+x}" expands to "x" only when the variable is defined, which
# distinguishes "defined but empty" from "not defined at all".
if [ "${NOTEBOOK_ARGS+x}" = "x" ]; then
    if [ -z "${NOTEBOOK_ARGS}" ]; then
        exec start-notebook.sh
    fi
    # Hand the string to a login shell so word splitting and quoting inside
    # NOTEBOOK_ARGS are interpreted by the shell parser.
    exec sh -lc "exec start-notebook.sh ${NOTEBOOK_ARGS}"
fi

# No NOTEBOOK_ARGS: a command is mandatory.
if [ "$#" -eq 0 ]; then
    echo "No NOTEBOOK_ARGS set and no command provided. Either set NOTEBOOK_ARGS to run start-notebook.sh, or provide a command, e.g.: python -m your.module" >&2
    exit 2
fi

# Replace this shell with the requested command.
exec "$@"
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# dnf repository definition for Mellanox OFED 24.10-1.1.4.0.
# NOTE(review): gpgcheck=0 turns off RPM signature verification for packages
# pulled from this repository. Consider gpgcheck=1 with a gpgkey= entry that
# points at the vendor's published signing key - TODO confirm the key URL.
# NOTE(review): baseurl is fixed to the rhel9.5 / x86_64 path - confirm this
# matches the UBI9 minor release and the architectures the image is built for.
[mlnx_ofed_24.10-1.1.4.0_base]
2+
name=Mellanox OFED Repository 24.10-1.1.4.0
3+
baseurl=https://linux.mellanox.com/public/repo/mlnx_ofed/24.10-1.1.4.0/rhel9.5/x86_64
4+
enabled=1
5+
gpgcheck=0
6+

0 commit comments

Comments
 (0)