@@ -39,10 +39,13 @@ COPY LICENSE.md /licenses/rocm-license.md
3939USER 0
4040WORKDIR /opt/app-root/bin
4141
42- # Environment variables for ROCm
42+ # Environment variables for ROCm (full paths for HIP/ROCm toolchain)
4343ENV ROCM_HOME=/opt/rocm \
44- PATH=/opt/rocm/bin:$PATH \
45- LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
44+ ROCM_PATH=/opt/rocm \
45+ HIP_PATH=/opt/rocm \
46+ PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
47+ LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH \
48+ CMAKE_PREFIX_PATH=/opt/rocm
4649
4750# ###############################################################################
4851# System Dependencies Stage
@@ -57,21 +60,49 @@ COPY mellanox.repo rocm.repo /etc/yum.repos.d/
5760
5861# Install ROCm development tools
5962# Using individual packages instead of metapackages to avoid python3-wheel dependency issue
60- # hipcc is the HIP compiler (may be needed for building ROCm packages)
61- # rocm-device-libs provides the GPU device library required by clang for ROCm compilation
63+ # - rocm-llvm: LLVM compiler required by hipcc (provides /opt/rocm/llvm/bin/clang++)
64+ # - hipcc: HIP compiler wrapper
65+ # - hip-devel: HIP development headers
66+ # - rocm-device-libs: GPU device library required by clang for ROCm compilation
6267RUN dnf install -y --setopt=install_weak_deps=False \
68+ rocm-llvm \
6369 hipcc \
6470 hip-devel \
6571 hip-runtime-amd \
72+ rocthrust \
73+ hipsparse-devel \
74+ hipsparse \
75+ hipcub-devel \
76+ rocprim-devel \
77+ hipblaslt-devel \
78+ rocrand \
79+ hipfft \
80+ rocfft \
6681 rocm-cmake \
6782 rocm-device-libs \
6883 rocblas-devel \
6984 hipblas-devel \
7085 rocsolver-devel \
7186 hipsolver-devel && \
72- dnf clean all && rm -rf /var/cache/dnf/* && \
73- # hipcc installs to /opt/rocm-X.Y.Z/bin but we need /opt/rocm/bin/hipcc
74- ln -sf /opt/rocm-*/bin/hipcc /opt/rocm/bin/hipcc
87+ dnf clean all && rm -rf /var/cache/dnf/*
88+
89+ # Fix /opt/rocm symlink - base image has it pointing to /etc/alternatives/rocm
90+ # which doesn't contain the full ROCm installation. We need it to point to /opt/rocm-6.4.3
91+ RUN echo "=== Fixing ROCm symlink ===" && \
92+ echo "Current /opt/rocm points to:" && readlink /opt/rocm && \
93+ rm -f /opt/rocm && \
94+ ln -sf /opt/rocm-6.4.3 /opt/rocm && \
95+ echo "Fixed /opt/rocm now points to:" && readlink /opt/rocm && \
96+ ls -la /opt/rocm/ && \
97+ echo "=== ROCm symlink fixed ==="
98+
99+ # Verify ROCm/HIP toolchain is properly installed (test -d guards the device-libs check: `ls | head` alone returns head's status)
100+ RUN echo "=== Verifying ROCm/HIP installation ===" && \
101+ echo "hipcc:" && ls -la /opt/rocm/bin/hipcc && \
102+ echo "clang++:" && ls -la /opt/rocm/lib/llvm/bin/clang++ && \
103+ echo "Testing hipcc:" && /opt/rocm/bin/hipcc --version && \
104+ echo "ROCm device libs:" && test -d /opt/rocm/amdgcn/bitcode/ && ls /opt/rocm/amdgcn/bitcode/ | head -5 && \
105+ echo "=== ROCm verification complete ==="
75106
76107# Install system packages (RDMA and build toolchain)
77108#
@@ -121,7 +152,8 @@ COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv
121152
122153# Copy dependency files
123154# pylock.toml: All dependencies including ROCm PyTorch (compiled with --find-links)
124- COPY --chown=1001:0 pyproject.toml pylock.toml ./
155+ # requirements-special.txt: Packages needing --no-build-isolation (flash-attn)
156+ COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./
125157
126158# Switch to user 1001 for pip installations
127159USER 1001
@@ -131,27 +163,55 @@ WORKDIR /opt/app-root/src
131163# This syncs the environment to match exactly what's in the lockfile
132164# pylock.toml was compiled with --find-links=https://download.pytorch.org/whl/rocm6.4
133165# so torch comes from ROCm index
134- #
135- # flash-attn requires torch at build time and GPU architecture info, so we:
136- # 1. First install torch from ROCm index
137- # 2. Set GPU_ARCHS so flash-attn knows what to build for (no GPU needed at build time)
138- # 3. Then sync all dependencies with --no-build-isolation
139- ENV UV_NO_CACHE=1 \
140- GPU_ARCHS="gfx90a;gfx942" \
141- PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
142- MAX_JOBS=32 \
143- CMAKE_BUILD_PARALLEL_LEVEL=32
144- RUN uv pip install --index-strategy=unsafe-best-match --index-url=https://download.pytorch.org/whl/rocm6.4 --extra-index-url=https://pypi.org/simple "torch==2.9.0+rocm6.4"
145- RUN uv pip sync --python-platform=linux --python-version=3.12 --no-build-isolation /tmp/deps/pylock.toml
166+ ENV UV_NO_CACHE=1
167+ RUN uv pip sync --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml
146168ENV UV_NO_CACHE=
147169
148170# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
149171# TODO: use aipcc index
150172RUN pip install --retries 5 --timeout 300 --no-cache-dir \
151173 "git+https://github.com/opendatahub-io/kubeflow-sdk@main"
152174
153- # flash-attn is included as a transitive dependency from instructlab-training[rocm]
154- # in pylock.toml (version 2.8.3), so no separate install needed
175+ # Install flash-attn from requirements-special.txt
176+ # Requires:
177+ # - GPU_ARCHS: tells flash-attn which ROCm architectures to build for (no GPU needed at build time)
178+ # - PYTORCH_ROCM_ARCH: additional hint for PyTorch/ROCm
179+ # - MAX_JOBS/CMAKE_BUILD_PARALLEL_LEVEL: parallel kernel compilation (can be overridden via build-args)
180+ # - --no-build-isolation: use pre-installed torch for the build
181+ # - --no-deps: flash-attn deps already satisfied by pylock.toml
182+
183+ # Accept build args for parallelism (can be overridden by argfile.konflux.conf)
184+ ARG MAX_JOBS=16
185+ ARG CMAKE_BUILD_PARALLEL_LEVEL=8
186+
187+ # Set environment for flash-attn build
188+ ENV GPU_ARCHS="gfx90a;gfx942" \
189+ PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
190+ MAX_JOBS=${MAX_JOBS} \
191+ CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}
192+
193+ # Verify ROCm tools are accessible before building flash-attn
194+ # This runs in python-deps stage to ensure symlinks from system-deps are inherited
195+ RUN echo "=== Pre-build verification in python-deps stage ===" && \
196+ echo "Checking /opt/rocm/bin/hipcc:" && \
197+ ls -la /opt/rocm/bin/hipcc && \
198+ echo "Checking symlink target exists:" && \
199+ readlink -f /opt/rocm/bin/hipcc && \
200+ ls -la "$(readlink -f /opt/rocm/bin/hipcc)" && \
201+ echo "Testing hipcc execution:" && \
202+ /opt/rocm/bin/hipcc --version && \
203+ echo "=== Pre-build verification passed ==="
204+
205+ # Build flash-attn with verbose output; pipefail makes a pip failure fail the layer instead of being masked by tee (assumes /bin/sh is bash, as on RHEL/UBI-style images)
206+ RUN set -o pipefail && echo "=== Starting flash-attn build ===" && \
207+ echo "MAX_JOBS=${MAX_JOBS}" && \
208+ echo "CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}" && \
209+ echo "GPU_ARCHS=${GPU_ARCHS}" && \
210+ echo "ROCM_HOME=${ROCM_HOME}" && \
211+ echo "HIP_PATH=${HIP_PATH}" && \
212+ pip install --no-build-isolation --no-cache-dir --no-deps --verbose \
213+ $(grep "^flash-attn" /tmp/deps/requirements-special.txt) 2>&1 | tee /tmp/flash-attn-build.log && \
214+ echo "=== flash-attn build complete ==="
155215
156216# Fix permissions for OpenShift
157217ARG PYTHON_VERSION
@@ -189,10 +249,12 @@ RUN ldconfig
189249# FIPS-friendly: Remove uv from final image
190250RUN rm -f /opt/app-root/bin/uv
191251
192- # Environment variables for ROCm
252+ # Environment variables for ROCm (full paths for HIP/ROCm toolchain)
193253ENV ROCM_HOME=/opt/rocm \
194- PATH=/opt/rocm/bin:$PATH \
195- LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
254+ ROCM_PATH=/opt/rocm \
255+ HIP_PATH=/opt/rocm \
256+ PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
257+ LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH
196258
197259# Copy license file
198260COPY LICENSE.md /licenses/rocm-license.md
0 commit comments