@@ -27,9 +27,9 @@ RUN pip install --no-cache-dir uv
2727# ###############################################################################
2828FROM ${BASE_IMAGE} AS base
2929
30- LABEL name="rocm:py312-rocm64-torch280" \
31- summary="ROCm 6.4 Python 3.12 image with PyTorch 2.8.0" \
32- description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.8.0) on UBI9" \
30+ LABEL name="rocm:py312-rocm64-torch290" \
31+ summary="ROCm 6.4 Python 3.12 image with PyTorch 2.9.0" \
32+ description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.9.0) on UBI9" \
3333 io.k8s.display-name="ROCm 6.4 Python 3.12 (Workbench + Runtime)" \
3434 io.k8s.description="ROCm image: Jupyter workbench by default; runtime when command provided."
3535
@@ -39,10 +39,13 @@ COPY LICENSE.md /licenses/rocm-license.md
3939USER 0
4040WORKDIR /opt/app-root/bin
4141
42- # Environment variables for ROCm
42+ # Environment variables for ROCm (full paths for HIP/ROCm toolchain). LD_LIBRARY_PATH appends the inherited value only when it is set, so an unset variable does not leave a trailing ':' (an empty entry makes the loader search the current directory).
4343ENV ROCM_HOME=/opt/rocm \
44- PATH=/opt/rocm/bin:$PATH \
45- LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
44+ ROCM_PATH=/opt/rocm \
45+ HIP_PATH=/opt/rocm \
46+ PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
47+ LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} \
48+ CMAKE_PREFIX_PATH=/opt/rocm
4649
4750# ###############################################################################
4851# System Dependencies Stage
@@ -57,21 +60,49 @@ COPY mellanox.repo rocm.repo /etc/yum.repos.d/
5760
5861# Install ROCm development tools
5962# Using individual packages instead of metapackages to avoid python3-wheel dependency issue
60- # hipcc is the HIP compiler needed for flash-attention build
61- # rocm-device-libs provides the GPU device library required by clang for ROCm compilation
63+ # - rocm-llvm: LLVM compiler required by hipcc (provides /opt/rocm/llvm/bin/clang++)
64+ # - hipcc: HIP compiler wrapper
65+ # - hip-devel: HIP development headers
66+ # - rocm-device-libs: GPU device library required by clang for ROCm compilation
6267RUN dnf install -y --setopt=install_weak_deps=False \
68+ rocm-llvm \
6369 hipcc \
6470 hip-devel \
6571 hip-runtime-amd \
72+ rocthrust \
73+ hipsparse-devel \
74+ hipsparse \
75+ hipcub-devel \
76+ rocprim-devel \
77+ hipblaslt-devel \
78+ rocrand \
79+ hipfft \
80+ rocfft \
6681 rocm-cmake \
6782 rocm-device-libs \
6883 rocblas-devel \
6984 hipblas-devel \
7085 rocsolver-devel \
7186 hipsolver-devel && \
72- dnf clean all && rm -rf /var/cache/dnf/* && \
73- # hipcc installs to /opt/rocm-X.Y.Z/bin but we need /opt/rocm/bin/hipcc
74- ln -sf /opt/rocm-*/bin/hipcc /opt/rocm/bin/hipcc
87+ dnf clean all && rm -rf /var/cache/dnf/*
88+
89+ # Fix /opt/rocm symlink - base image has it pointing to /etc/alternatives/rocm
90+ # which doesn't contain the full ROCm installation. We need it to point to /opt/rocm-6.4.3
91+ RUN echo "=== Fixing ROCm symlink ===" && \
92+ echo "Current /opt/rocm points to:" && readlink /opt/rocm && \
93+ rm -f /opt/rocm && \
94+ ln -sf /opt/rocm-6.4.3 /opt/rocm && \
95+ echo "Fixed /opt/rocm now points to:" && readlink /opt/rocm && \
96+ ls -la /opt/rocm/ && \
97+ echo "=== ROCm symlink fixed ==="
98+
99+ # Verify ROCm/HIP toolchain is properly installed. The bitcode directory is checked with `test -d` first: without pipefail, a failing `ls` piped into `head` would still exit 0 and the check could never fail.
100+ RUN echo "=== Verifying ROCm/HIP installation ===" && \
101+ echo "hipcc:" && ls -la /opt/rocm/bin/hipcc && \
102+ echo "clang++:" && ls -la /opt/rocm/lib/llvm/bin/clang++ && \
103+ echo "Testing hipcc:" && /opt/rocm/bin/hipcc --version && \
104+ echo "ROCm device libs:" && test -d /opt/rocm/amdgcn/bitcode/ && ls /opt/rocm/amdgcn/bitcode/ | head -5 && \
105+ echo "=== ROCm verification complete ==="
75106
76107# Install system packages (RDMA and build toolchain)
77108#
@@ -121,7 +152,8 @@ COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv
121152
122153# Copy dependency files
123154# pylock.toml: All dependencies including ROCm PyTorch (compiled with --find-links)
124- COPY --chown=1001:0 pyproject.toml pylock.toml ./
155+ # requirements-special.txt: Packages needing --no-build-isolation (flash-attn)
156+ COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./
125157
126158# Switch to user 1001 for pip installations
127159USER 1001
@@ -140,18 +172,46 @@ ENV UV_NO_CACHE=
140172RUN pip install --retries 5 --timeout 300 --no-cache-dir \
141173 "git+https://github.com/opendatahub-io/kubeflow-sdk@main"
142174
143- # TODO: Re-enable Flash Attention after confirming base image works
144- # Install Flash Attention from original Dao-AILab repo
145- # --no-build-isolation: Use already-installed torch instead of isolated env
146- # USER 0
147- # ENV GPU_ARCHS="gfx90a;gfx942"
148- # RUN cd /tmp \
149- # && git clone --depth 1 --branch v2.8.3 https://github.com/Dao-AILab/flash-attention.git \
150- # && cd flash-attention \
151- # && git submodule update --init \
152- # && MAX_JOBS="16" pip install --no-build-isolation --no-cache-dir --no-deps . \
153- # && cd / && rm -rf /tmp/flash-attention
154-
175+ # Install flash-attn from requirements-special.txt
176+ # Requires:
177+ # - GPU_ARCHS: tells flash-attn which ROCm architectures to build for (no GPU needed at build time)
178+ # - PYTORCH_ROCM_ARCH: additional hint for PyTorch/ROCm
179+ # - MAX_JOBS/CMAKE_BUILD_PARALLEL_LEVEL: parallel kernel compilation (can be overridden via build-args)
180+ # - --no-build-isolation: use pre-installed torch for the build
181+ # - --no-deps: flash-attn deps already satisfied by pylock.toml
182+
183+ # Accept build args for parallelism (can be overridden by argfile.konflux.conf)
184+ ARG MAX_JOBS=16
185+ ARG CMAKE_BUILD_PARALLEL_LEVEL=8
186+
187+ # Set environment for flash-attn build
188+ ENV GPU_ARCHS="gfx90a;gfx942" \
189+ PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
190+ MAX_JOBS=${MAX_JOBS} \
191+ CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}
192+
193+ # Verify ROCm tools are accessible before building flash-attn
194+ # This runs in python-deps stage to ensure symlinks from system-deps are inherited. The resolved path is quoted so an empty or space-containing result cannot change what `ls` is asked to list.
195+ RUN echo "=== Pre-build verification in python-deps stage ===" && \
196+ echo "Checking /opt/rocm/bin/hipcc:" && \
197+ ls -la /opt/rocm/bin/hipcc && \
198+ echo "Checking symlink target exists:" && \
199+ readlink -f /opt/rocm/bin/hipcc && \
200+ ls -la "$(readlink -f /opt/rocm/bin/hipcc)" && \
201+ echo "Testing hipcc execution:" && \
202+ /opt/rocm/bin/hipcc --version && \
203+ echo "=== Pre-build verification passed ==="
204+
205+ # Build flash-attn with verbose output (captured by the build log). pip is deliberately not piped through `tee`: without pipefail the pipeline would return tee's exit status, so a failed flash-attn build would be reported as success and the log file would be baked into the layer.
206+ RUN echo "=== Starting flash-attn build ===" && \
207+ echo "MAX_JOBS=${MAX_JOBS}" && \
208+ echo "CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}" && \
209+ echo "GPU_ARCHS=${GPU_ARCHS}" && \
210+ echo "ROCM_HOME=${ROCM_HOME}" && \
211+ echo "HIP_PATH=${HIP_PATH}" && \
212+ pip install --no-build-isolation --no-cache-dir --no-deps --verbose \
213+ $(grep "^flash-attn" /tmp/deps/requirements-special.txt) && \
214+ echo "=== flash-attn build complete ==="
155215
156216# Fix permissions for OpenShift
157217ARG PYTHON_VERSION
@@ -189,10 +249,12 @@ RUN ldconfig
189249# FIPS-friendly: Remove uv from final image
190250RUN rm -f /opt/app-root/bin/uv
191251
192- # Environment variables for ROCm
252+ # Environment variables for ROCm (full paths for HIP/ROCm toolchain). LD_LIBRARY_PATH appends the inherited value only when it is set, so an unset variable does not leave a trailing ':' (an empty entry makes the loader search the current directory).
193253ENV ROCM_HOME=/opt/rocm \
194- PATH=/opt/rocm/bin:$PATH \
195- LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
254+ ROCM_PATH=/opt/rocm \
255+ HIP_PATH=/opt/rocm \
256+ PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
257+ LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
196258
197259# Copy license file
198260COPY LICENSE.md /licenses/rocm-license.md
0 commit comments