Skip to content

Commit 09618dc

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 03611c3 + 083084b commit 09618dc

22 files changed

+3484
-1207
lines changed

.tekton/odh-training-rocm64-torch29-py312-rhel9-pull-request.yaml

Lines changed: 653 additions & 0 deletions
Large diffs are not rendered by default.

.tekton/odh-training-rocm64-torch29-py312-rhel9-push.yaml

Lines changed: 650 additions & 0 deletions
Large diffs are not rendered by default.

go.mod

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,28 @@ module github.com/opendatahub-io/distributed-workloads
22

33
go 1.24.2
44

5+
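// Remove this replace block once the Training Operator v1 tests are removed. Until then it pins training-operator to v1.7.0 and kueue to v0.6.2, taking precedence over the versions listed in the require block (including kueue v0.10.2).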
6+
replace (
7+
github.com/kubeflow/training-operator => github.com/kubeflow/training-operator v1.7.0
8+
sigs.k8s.io/kueue => sigs.k8s.io/kueue v0.6.2
9+
)
10+
511
require (
612
github.com/kubeflow/trainer/v2 v2.0.0
713
github.com/kubeflow/training-operator v1.7.0
814
github.com/matoous/go-nanoid/v2 v2.1.0
9-
github.com/onsi/gomega v1.37.0
10-
github.com/openshift/api v0.0.0-20251015095338-264e80a2b6e7
15+
github.com/onsi/gomega v1.38.2
16+
github.com/openshift/api v0.0.0-20251124165233-999c45c0835a
1117
github.com/openshift/client-go v0.0.0-20251015124057-db0dee36e235
12-
github.com/prometheus/client_golang v1.22.0
13-
github.com/prometheus/common v0.62.0
18+
github.com/openshift/kueue-operator v0.0.0-20251202204851-958c48004dad
19+
github.com/prometheus/client_golang v1.23.0
20+
github.com/prometheus/common v0.65.0
1421
github.com/ray-project/kuberay/ray-operator v1.3.0
1522
k8s.io/api v0.34.1
1623
k8s.io/apimachinery v0.34.1
1724
k8s.io/client-go v0.34.1
18-
sigs.k8s.io/kueue v0.6.2
25+
sigs.k8s.io/jobset v0.9.1
26+
sigs.k8s.io/kueue v0.10.2
1927
)
2028

2129
require (
@@ -25,9 +33,9 @@ require (
2533
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
2634
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
2735
github.com/go-logr/logr v1.4.3 // indirect
28-
github.com/go-openapi/jsonpointer v0.21.0 // indirect
36+
github.com/go-openapi/jsonpointer v0.21.1 // indirect
2937
github.com/go-openapi/jsonreference v0.21.0 // indirect
30-
github.com/go-openapi/swag v0.23.0 // indirect
38+
github.com/go-openapi/swag v0.23.1 // indirect
3139
github.com/gogo/protobuf v1.3.2 // indirect
3240
github.com/google/gnostic-models v0.7.0 // indirect
3341
github.com/google/go-cmp v0.7.0 // indirect
@@ -41,23 +49,20 @@ require (
4149
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
4250
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
4351
github.com/pkg/errors v0.9.1 // indirect
44-
github.com/prometheus/client_model v0.6.1 // indirect
45-
github.com/prometheus/procfs v0.15.1 // indirect
52+
github.com/prometheus/client_model v0.6.2 // indirect
53+
github.com/prometheus/procfs v0.16.1 // indirect
4654
github.com/sirupsen/logrus v1.9.3 // indirect
4755
github.com/spf13/pflag v1.0.6 // indirect
4856
github.com/x448/float16 v0.8.4 // indirect
49-
go.etcd.io/etcd/client/pkg/v3 v3.5.21 // indirect
50-
go.etcd.io/etcd/client/v3 v3.5.21 // indirect
51-
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0 // indirect
5257
go.yaml.in/yaml/v2 v2.4.2 // indirect
5358
go.yaml.in/yaml/v3 v3.0.4 // indirect
54-
golang.org/x/net v0.38.0 // indirect
55-
golang.org/x/oauth2 v0.27.0 // indirect
56-
golang.org/x/sys v0.32.0 // indirect
57-
golang.org/x/term v0.30.0 // indirect
58-
golang.org/x/text v0.23.0 // indirect
59-
golang.org/x/time v0.10.0 // indirect
60-
google.golang.org/protobuf v1.36.5 // indirect
59+
golang.org/x/net v0.47.0 // indirect
60+
golang.org/x/oauth2 v0.30.0 // indirect
61+
golang.org/x/sys v0.38.0 // indirect
62+
golang.org/x/term v0.37.0 // indirect
63+
golang.org/x/text v0.31.0 // indirect
64+
golang.org/x/time v0.11.0 // indirect
65+
google.golang.org/protobuf v1.36.7 // indirect
6166
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
6267
gopkg.in/inf.v0 v0.9.1 // indirect
6368
gopkg.in/yaml.v2 v2.4.0 // indirect
@@ -66,10 +71,8 @@ require (
6671
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect
6772
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
6873
sigs.k8s.io/controller-runtime v0.21.0 // indirect
69-
sigs.k8s.io/jobset v0.8.2 // indirect
7074
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
7175
sigs.k8s.io/randfill v1.0.0 // indirect
72-
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect
7376
sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect
7477
sigs.k8s.io/yaml v1.6.0 // indirect
7578
)

go.sum

Lines changed: 101 additions & 98 deletions
Large diffs are not rendered by default.

images/universal/training/rocm64-torch280-py312/requirements-special.txt

Lines changed: 0 additions & 8 deletions
This file was deleted.

images/universal/training/rocm64-torch280-py312/Dockerfile renamed to images/universal/training/rocm64-torch290-py312/Dockerfile

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ RUN pip install --no-cache-dir uv
2727
################################################################################
2828
FROM ${BASE_IMAGE} AS base
2929

30-
LABEL name="rocm:py312-rocm64-torch280" \
31-
summary="ROCm 6.4 Python 3.12 image with PyTorch 2.8.0" \
32-
description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.8.0) on UBI9" \
30+
LABEL name="rocm:py312-rocm64-torch290" \
31+
summary="ROCm 6.4 Python 3.12 image with PyTorch 2.9.0" \
32+
description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.9.0) on UBI9" \
3333
io.k8s.display-name="ROCm 6.4 Python 3.12 (Workbench + Runtime)" \
3434
io.k8s.description="ROCm image: Jupyter workbench by default; runtime when command provided."
3535

@@ -39,10 +39,13 @@ COPY LICENSE.md /licenses/rocm-license.md
3939
USER 0
4040
WORKDIR /opt/app-root/bin
4141

42-
# Environment variables for ROCm
42+
# Environment variables for ROCm (full paths for HIP/ROCm toolchain)
4343
ENV ROCM_HOME=/opt/rocm \
44-
PATH=/opt/rocm/bin:$PATH \
45-
LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
44+
ROCM_PATH=/opt/rocm \
45+
HIP_PATH=/opt/rocm \
46+
PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
47+
LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH \
48+
CMAKE_PREFIX_PATH=/opt/rocm
4649

4750
################################################################################
4851
# System Dependencies Stage
@@ -57,21 +60,49 @@ COPY mellanox.repo rocm.repo /etc/yum.repos.d/
5760

5861
# Install ROCm development tools
5962
# Using individual packages instead of metapackages to avoid a python3-wheel dependency issue
60-
# hipcc is the HIP compiler needed for flash-attention build
61-
# rocm-device-libs provides the GPU device library required by clang for ROCm compilation
63+
# - rocm-llvm: LLVM compiler required by hipcc (provides /opt/rocm/llvm/bin/clang++)
64+
# - hipcc: HIP compiler wrapper
65+
# - hip-devel: HIP development headers
66+
# - rocm-device-libs: GPU device library required by clang for ROCm compilation
6267
RUN dnf install -y --setopt=install_weak_deps=False \
68+
rocm-llvm \
6369
hipcc \
6470
hip-devel \
6571
hip-runtime-amd \
72+
rocthrust \
73+
hipsparse-devel \
74+
hipsparse \
75+
hipcub-devel \
76+
rocprim-devel \
77+
hipblaslt-devel \
78+
rocrand \
79+
hipfft \
80+
rocfft \
6681
rocm-cmake \
6782
rocm-device-libs \
6883
rocblas-devel \
6984
hipblas-devel \
7085
rocsolver-devel \
7186
hipsolver-devel && \
72-
dnf clean all && rm -rf /var/cache/dnf/* && \
73-
# hipcc installs to /opt/rocm-X.Y.Z/bin but we need /opt/rocm/bin/hipcc
74-
ln -sf /opt/rocm-*/bin/hipcc /opt/rocm/bin/hipcc
87+
dnf clean all && rm -rf /var/cache/dnf/*
88+
89+
# Fix /opt/rocm symlink - base image has it pointing to /etc/alternatives/rocm
90+
# which doesn't contain the full ROCm installation. We need it to point to /opt/rocm-6.4.3
91+
RUN echo "=== Fixing ROCm symlink ===" && \
92+
echo "Current /opt/rocm points to:" && readlink /opt/rocm && \
93+
rm -f /opt/rocm && \
94+
ln -sf /opt/rocm-6.4.3 /opt/rocm && \
95+
echo "Fixed /opt/rocm now points to:" && readlink /opt/rocm && \
96+
ls -la /opt/rocm/ && \
97+
echo "=== ROCm symlink fixed ==="
98+
99+
# Verify ROCm/HIP toolchain is properly installed
100+
RUN echo "=== Verifying ROCm/HIP installation ===" && \
101+
echo "hipcc:" && ls -la /opt/rocm/bin/hipcc && \
102+
echo "clang++:" && ls -la /opt/rocm/lib/llvm/bin/clang++ && \
103+
echo "Testing hipcc:" && /opt/rocm/bin/hipcc --version && \
104+
echo "ROCm device libs:" && ls /opt/rocm/amdgcn/bitcode/ | head -5 && \
105+
echo "=== ROCm verification complete ==="
75106

76107
# Install system packages (RDMA and build toolchain)
77108
#
@@ -121,7 +152,8 @@ COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv
121152

122153
# Copy dependency files
123154
# pylock.toml: All dependencies including ROCm PyTorch (compiled with --find-links)
124-
COPY --chown=1001:0 pyproject.toml pylock.toml ./
155+
# requirements-special.txt: Packages needing --no-build-isolation (flash-attn)
156+
COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./
125157

126158
# Switch to user 1001 for pip installations
127159
USER 1001
@@ -140,18 +172,46 @@ ENV UV_NO_CACHE=
140172
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
141173
"git+https://github.com/opendatahub-io/kubeflow-sdk@main"
142174

143-
# TODO: Re-enable Flash Attention after confirming base image works
144-
# Install Flash Attention from original Dao-AILab repo
145-
# --no-build-isolation: Use already-installed torch instead of isolated env
146-
# USER 0
147-
# ENV GPU_ARCHS="gfx90a;gfx942"
148-
# RUN cd /tmp \
149-
# && git clone --depth 1 --branch v2.8.3 https://github.com/Dao-AILab/flash-attention.git \
150-
# && cd flash-attention \
151-
# && git submodule update --init \
152-
# && MAX_JOBS="16" pip install --no-build-isolation --no-cache-dir --no-deps . \
153-
# && cd / && rm -rf /tmp/flash-attention
154-
175+
# Install flash-attn from requirements-special.txt
176+
# Requires:
177+
# - GPU_ARCHS: tells flash-attn which ROCm architectures to build for (no GPU needed at build time)
178+
# - PYTORCH_ROCM_ARCH: additional hint for PyTorch/ROCm
179+
# - MAX_JOBS/CMAKE_BUILD_PARALLEL_LEVEL: parallel kernel compilation (can be overridden via build-args)
180+
# - --no-build-isolation: use pre-installed torch for the build
181+
# - --no-deps: flash-attn deps already satisfied by pylock.toml
182+
183+
# Accept build args for parallelism (can be overridden by argfile.konflux.conf)
184+
ARG MAX_JOBS=16
185+
ARG CMAKE_BUILD_PARALLEL_LEVEL=8
186+
187+
# Set environment for flash-attn build
188+
ENV GPU_ARCHS="gfx90a;gfx942" \
189+
PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
190+
MAX_JOBS=${MAX_JOBS} \
191+
CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}
192+
193+
# Verify ROCm tools are accessible before building flash-attn
194+
# This runs in python-deps stage to ensure symlinks from system-deps are inherited
195+
RUN echo "=== Pre-build verification in python-deps stage ===" && \
196+
echo "Checking /opt/rocm/bin/hipcc:" && \
197+
ls -la /opt/rocm/bin/hipcc && \
198+
echo "Checking symlink target exists:" && \
199+
readlink -f /opt/rocm/bin/hipcc && \
200+
ls -la $(readlink -f /opt/rocm/bin/hipcc) && \
201+
echo "Testing hipcc execution:" && \
202+
/opt/rocm/bin/hipcc --version && \
203+
echo "=== Pre-build verification passed ==="
204+
205+
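# Build flash-attn with verbose output, saving a copy of the log to /tmp/flash-attn-build.log (NOTE(review): the output is piped through tee, so unless the shell has pipefail enabled a failed pip install will not fail this step - confirm the SHELL setting)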
206+
RUN echo "=== Starting flash-attn build ===" && \
207+
echo "MAX_JOBS=${MAX_JOBS}" && \
208+
echo "CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}" && \
209+
echo "GPU_ARCHS=${GPU_ARCHS}" && \
210+
echo "ROCM_HOME=${ROCM_HOME}" && \
211+
echo "HIP_PATH=${HIP_PATH}" && \
212+
pip install --no-build-isolation --no-cache-dir --no-deps --verbose \
213+
$(grep "^flash-attn" /tmp/deps/requirements-special.txt) 2>&1 | tee /tmp/flash-attn-build.log && \
214+
echo "=== flash-attn build complete ==="
155215

156216
# Fix permissions for OpenShift
157217
ARG PYTHON_VERSION
@@ -189,10 +249,12 @@ RUN ldconfig
189249
# FIPS-friendly: Remove uv from final image
190250
RUN rm -f /opt/app-root/bin/uv
191251

192-
# Environment variables for ROCm
252+
# Environment variables for ROCm (full paths for HIP/ROCm toolchain)
193253
ENV ROCM_HOME=/opt/rocm \
194-
PATH=/opt/rocm/bin:$PATH \
195-
LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
254+
ROCM_PATH=/opt/rocm \
255+
HIP_PATH=/opt/rocm \
256+
PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH \
257+
LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH
196258

197259
# Copy license file
198260
COPY LICENSE.md /licenses/rocm-license.md

images/universal/training/rocm64-torch280-py312/LICENSE.md renamed to images/universal/training/rocm64-torch290-py312/LICENSE.md

File renamed without changes.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
MAX_JOBS=32
2+
CMAKE_BUILD_PARALLEL_LEVEL=32

images/universal/training/rocm64-torch280-py312/entrypoint-universal.sh renamed to images/universal/training/rocm64-torch290-py312/entrypoint-universal.sh

File renamed without changes.

images/universal/training/rocm64-torch280-py312/mellanox.repo renamed to images/universal/training/rocm64-torch290-py312/mellanox.repo

File renamed without changes.

0 commit comments

Comments
 (0)