Skip to content

Commit 495a1ee

Browse files
Further updates to flash-attn (fa) installation
1 parent 7a13fe1 commit 495a1ee

File tree

4 files changed

+1297
-1303
lines changed

4 files changed

+1297
-1303
lines changed

images/universal/training/rocm64-torch290-py312/Dockerfile

Lines changed: 88 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,13 @@ COPY LICENSE.md /licenses/rocm-license.md
3939
USER 0
4040
WORKDIR /opt/app-root/bin
4141

42-
# Environment variables for ROCm
42+
# Environment variables for ROCm (full paths for HIP/ROCm toolchain)
4343
ENV ROCM_HOME=/opt/rocm \
44-
PATH=/opt/rocm/bin:$PATH \
45-
LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
44+
ROCM_PATH=/opt/rocm \
45+
HIP_PATH=/opt/rocm \
46+
PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
47+
LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH \
48+
CMAKE_PREFIX_PATH=/opt/rocm
4649

4750
################################################################################
4851
# System Dependencies Stage
@@ -57,21 +60,49 @@ COPY mellanox.repo rocm.repo /etc/yum.repos.d/
5760

5861
# Install ROCm development tools
5962
# Using individual packages instead of metapackages to avoid python3-wheel dependency issue
60-
# hipcc is the HIP compiler (may be needed for building ROCm packages)
61-
# rocm-device-libs provides the GPU device library required by clang for ROCm compilation
63+
# - rocm-llvm: LLVM compiler required by hipcc (provides clang++ at /opt/rocm/llvm/bin, which the verification step below checks via /opt/rocm/lib/llvm/bin)
64+
# - hipcc: HIP compiler wrapper
65+
# - hip-devel: HIP development headers
66+
# - rocm-device-libs: GPU device library required by clang for ROCm compilation
6267
RUN dnf install -y --setopt=install_weak_deps=False \
68+
rocm-llvm \
6369
hipcc \
6470
hip-devel \
6571
hip-runtime-amd \
72+
rocthrust \
73+
hipsparse-devel \
74+
hipsparse \
75+
hipcub-devel \
76+
rocprim-devel \
77+
hipblaslt-devel \
78+
rocrand \
79+
hipfft \
80+
rocfft \
6681
rocm-cmake \
6782
rocm-device-libs \
6883
rocblas-devel \
6984
hipblas-devel \
7085
rocsolver-devel \
7186
hipsolver-devel && \
72-
dnf clean all && rm -rf /var/cache/dnf/* && \
73-
# hipcc installs to /opt/rocm-X.Y.Z/bin but we need /opt/rocm/bin/hipcc
74-
ln -sf /opt/rocm-*/bin/hipcc /opt/rocm/bin/hipcc
87+
dnf clean all && rm -rf /var/cache/dnf/*
88+
89+
# Fix /opt/rocm symlink - base image has it pointing to /etc/alternatives/rocm
90+
# which doesn't contain the full ROCm installation. We need it to point to /opt/rocm-6.4.3
91+
RUN echo "=== Fixing ROCm symlink ===" && \
92+
echo "Current /opt/rocm points to:" && readlink /opt/rocm && \
93+
rm -f /opt/rocm && \
94+
ln -sf /opt/rocm-6.4.3 /opt/rocm && \
95+
echo "Fixed /opt/rocm now points to:" && readlink /opt/rocm && \
96+
ls -la /opt/rocm/ && \
97+
echo "=== ROCm symlink fixed ==="
98+
99+
# Verify ROCm/HIP toolchain is properly installed
100+
RUN echo "=== Verifying ROCm/HIP installation ===" && \
101+
echo "hipcc:" && ls -la /opt/rocm/bin/hipcc && \
102+
echo "clang++:" && ls -la /opt/rocm/lib/llvm/bin/clang++ && \
103+
echo "Testing hipcc:" && /opt/rocm/bin/hipcc --version && \
104+
echo "ROCm device libs:" && ls /opt/rocm/amdgcn/bitcode/ | head -5 && \
105+
echo "=== ROCm verification complete ==="
75106

76107
# Install system packages (RDMA and build toolchain)
77108
#
@@ -121,7 +152,8 @@ COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv
121152

122153
# Copy dependency files
123154
# pylock.toml: All dependencies including ROCm PyTorch (compiled with --find-links)
124-
COPY --chown=1001:0 pyproject.toml pylock.toml ./
155+
# requirements-special.txt: Packages needing --no-build-isolation (flash-attn)
156+
COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./
125157

126158
# Switch to user 1001 for pip installations
127159
USER 1001
@@ -131,27 +163,55 @@ WORKDIR /opt/app-root/src
131163
# This syncs the environment to match exactly what's in the lockfile
132164
# pylock.toml was compiled with --find-links=https://download.pytorch.org/whl/rocm6.4
133165
# so torch comes from ROCm index
134-
#
135-
# flash-attn requires torch at build time and GPU architecture info, so we:
136-
# 1. First install torch from ROCm index
137-
# 2. Set GPU_ARCHS so flash-attn knows what to build for (no GPU needed at build time)
138-
# 3. Then sync all dependencies with --no-build-isolation
139-
ENV UV_NO_CACHE=1 \
140-
GPU_ARCHS="gfx90a;gfx942" \
141-
PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
142-
MAX_JOBS=32 \
143-
CMAKE_BUILD_PARALLEL_LEVEL=32
144-
RUN uv pip install --index-strategy=unsafe-best-match --index-url=https://download.pytorch.org/whl/rocm6.4 --extra-index-url=https://pypi.org/simple "torch==2.9.0+rocm6.4"
145-
RUN uv pip sync --python-platform=linux --python-version=3.12 --no-build-isolation /tmp/deps/pylock.toml
166+
ENV UV_NO_CACHE=1
167+
RUN uv pip sync --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml
146168
ENV UV_NO_CACHE=
147169

148170
# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
149171
# TODO: use aipcc index
150172
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
151173
"git+https://github.com/opendatahub-io/kubeflow-sdk@main"
152174

153-
# flash-attn is included as a transitive dependency from instructlab-training[rocm]
154-
# in pylock.toml (version 2.8.3), so no separate install needed
175+
# Install flash-attn from requirements-special.txt
176+
# Requires:
177+
# - GPU_ARCHS: tells flash-attn which ROCm architectures to build for (no GPU needed at build time)
178+
# - PYTORCH_ROCM_ARCH: the same architecture list as GPU_ARCHS, exported for PyTorch's ROCm extension build tooling
179+
# - MAX_JOBS/CMAKE_BUILD_PARALLEL_LEVEL: parallel kernel compilation (can be overridden via build-args)
180+
# - --no-build-isolation: use pre-installed torch for the build
181+
# - --no-deps: flash-attn deps already satisfied by pylock.toml
182+
183+
# Accept build args for parallelism (can be overridden by argfile.konflux.conf)
184+
ARG MAX_JOBS=16
185+
ARG CMAKE_BUILD_PARALLEL_LEVEL=8
186+
187+
# Set environment for flash-attn build
188+
ENV GPU_ARCHS="gfx90a;gfx942" \
189+
PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
190+
MAX_JOBS=${MAX_JOBS} \
191+
CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}
192+
193+
# Verify ROCm tools are accessible before building flash-attn
194+
# This runs in the python-deps stage to ensure the symlinks created in the system-deps stage are inherited
195+
RUN echo "=== Pre-build verification in python-deps stage ===" && \
196+
echo "Checking /opt/rocm/bin/hipcc:" && \
197+
ls -la /opt/rocm/bin/hipcc && \
198+
echo "Checking symlink target exists:" && \
199+
readlink -f /opt/rocm/bin/hipcc && \
200+
ls -la $(readlink -f /opt/rocm/bin/hipcc) && \
201+
echo "Testing hipcc execution:" && \
202+
/opt/rocm/bin/hipcc --version && \
203+
echo "=== Pre-build verification passed ==="
204+
205+
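# Build flash-attn with verbose output, teeing the log to /tmp/flash-attn-build.log (NOTE: unless pipefail is enabled for this shell, the `| tee` pipeline reports tee's exit status, so a failed pip install would not fail this step)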
206+
RUN echo "=== Starting flash-attn build ===" && \
207+
echo "MAX_JOBS=${MAX_JOBS}" && \
208+
echo "CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}" && \
209+
echo "GPU_ARCHS=${GPU_ARCHS}" && \
210+
echo "ROCM_HOME=${ROCM_HOME}" && \
211+
echo "HIP_PATH=${HIP_PATH}" && \
212+
pip install --no-build-isolation --no-cache-dir --no-deps --verbose \
213+
$(grep "^flash-attn" /tmp/deps/requirements-special.txt) 2>&1 | tee /tmp/flash-attn-build.log && \
214+
echo "=== flash-attn build complete ==="
155215

156216
# Fix permissions for OpenShift
157217
ARG PYTHON_VERSION
@@ -189,10 +249,12 @@ RUN ldconfig
189249
# FIPS-friendly: Remove uv from final image
190250
RUN rm -f /opt/app-root/bin/uv
191251

192-
# Environment variables for ROCm
252+
# Environment variables for ROCm (full paths for HIP/ROCm toolchain)
193253
ENV ROCM_HOME=/opt/rocm \
194-
PATH=/opt/rocm/bin:$PATH \
195-
LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
254+
ROCM_PATH=/opt/rocm \
255+
HIP_PATH=/opt/rocm \
256+
PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
257+
LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH
196258

197259
# Copy license file
198260
COPY LICENSE.md /licenses/rocm-license.md

0 commit comments

Comments
 (0)