Skip to content

Commit 99476c0

Browse files
try torch29 with FA 282 and konflux worker resource bump
1 parent 363bd4f commit 99476c0

File tree

5 files changed

+48
-28
lines changed

5 files changed

+48
-28
lines changed

.tekton/odh-training-rocm64-torch28-py312-rhel9-pull-request.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,15 @@ spec:
2121
timeouts:
2222
pipeline: 24h
2323
tasks: 20h
24+
taskRunSpecs:
25+
- pipelineTaskName: build-images
26+
stepSpecs:
27+
- name: build
28+
computeResources:
29+
requests:
30+
memory: 10Gi
31+
limits:
32+
memory: 10Gi
2433
params:
2534
- name: git-url
2635
value: '{{source_url}}'

.tekton/odh-training-rocm64-torch28-py312-rhel9-push.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@ spec:
2020
timeouts:
2121
pipeline: 24h
2222
tasks: 20h
23+
taskRunSpecs:
24+
- pipelineTaskName: build-images
25+
stepSpecs:
26+
- name: build
27+
computeResources:
28+
requests:
29+
memory: 10Gi
30+
limits:
31+
memory: 10Gi
2332
params:
2433
- name: git-url
2534
value: '{{source_url}}'

images/universal/training/rocm64-torch280-py312/Dockerfile

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ RUN pip install --no-cache-dir uv
2727
################################################################################
2828
FROM ${BASE_IMAGE} AS base
2929

30-
LABEL name="rocm:py312-rocm64-torch280" \
31-
summary="ROCm 6.4 Python 3.12 image with PyTorch 2.8.0" \
32-
description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.8.0) on UBI9" \
30+
LABEL name="rocm:py312-rocm64-torch290" \
31+
summary="ROCm 6.4 Python 3.12 image with PyTorch 2.9.0" \
32+
description="ROCm image combining minimal Jupyter workbench and runtime ML stack (ROCm 6.4, PyTorch 2.9.0) on UBI9" \
3333
io.k8s.display-name="ROCm 6.4 Python 3.12 (Workbench + Runtime)" \
3434
io.k8s.description="ROCm image: Jupyter workbench by default; runtime when command provided."
3535

@@ -140,17 +140,16 @@ ENV UV_NO_CACHE=
140140
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
141141
"git+https://github.com/opendatahub-io/kubeflow-sdk@main"
142142

143-
# TODO: Re-enable Flash Attention after confirming base image works
144143
# Install Flash Attention from original Dao-AILab repo
145144
# --no-build-isolation: Use already-installed torch instead of isolated env
146-
# USER 0
147-
# ENV GPU_ARCHS="gfx90a;gfx942"
148-
# RUN cd /tmp \
149-
# && git clone --depth 1 --branch v2.8.3 https://github.com/Dao-AILab/flash-attention.git \
150-
# && cd flash-attention \
151-
# && git submodule update --init \
152-
# && MAX_JOBS="16" pip install --no-build-isolation --no-cache-dir --no-deps . \
153-
# && cd / && rm -rf /tmp/flash-attention
145+
USER 0
146+
ENV GPU_ARCHS="gfx90a;gfx942"
147+
RUN cd /tmp \
148+
&& git clone --depth 1 --branch v2.8.2 https://github.com/Dao-AILab/flash-attention.git \
149+
&& cd flash-attention \
150+
&& git submodule update --init \
151+
&& MAX_JOBS="4" pip install --no-build-isolation --no-cache-dir --no-deps . \
152+
&& cd / && rm -rf /tmp/flash-attention
154153

155154

156155
# Fix permissions for OpenShift

images/universal/training/rocm64-torch280-py312/pylock.toml

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1895,15 +1895,17 @@ wheels = [{ url = "https://files.pythonhosted.org/packages/08/20/0f2523b9e50a805
18951895

18961896
[[packages]]
18971897
name = "pytorch-triton-rocm"
1898-
version = "3.4.0"
1899-
marker = "platform_machine == 'x86_64' and sys_platform == 'linux'"
1898+
version = "3.5.0"
1899+
marker = "sys_platform == 'linux'"
19001900
wheels = [
1901-
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.4.0-cp310-cp310-linux_x86_64.whl", hashes = { sha256 = "1ee0a5cf569175e63b43bc334dcaaf6f9b0d88eb455a452869c2bab14e1f7eb4" } },
1902-
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.4.0-cp311-cp311-linux_x86_64.whl", hashes = { sha256 = "b0362725d8e16d185251e3dcd48455ebf9cdaad2c26052bb47ef08a1d687ed20" } },
1903-
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.4.0-cp312-cp312-linux_x86_64.whl", hashes = { sha256 = "7afe951b9fc38f1a5b3a7b98bebbaa092bf51e6192b699b4fade9b1ad6fc9c2c" } },
1904-
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.4.0-cp313-cp313-linux_x86_64.whl", hashes = { sha256 = "1e7ccba3501fcd38e8cd8415f97a654043370e1fdc5a936bb75abe1bebeb94c9" } },
1905-
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.4.0-cp313-cp313t-linux_x86_64.whl", hashes = { sha256 = "c262cd42e38b6955391338cca1c3a779cceb8c51e4b45200d87305c870ef99d7" } },
1906-
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.4.0-cp39-cp39-linux_x86_64.whl", hashes = { sha256 = "7f26455680a068b04456c6977348d404ff31d2fef1d7472e980483db7acbe462" } },
1901+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp310-cp310-linux_x86_64.whl", hashes = { sha256 = "f2ea8cb272afcdb8b08a85eb947ed01d0921a56b26aee8648c6d1de1aaf4db08" } },
1902+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp311-cp311-linux_x86_64.whl", hashes = { sha256 = "b3d4624a662e0dfad60d679166047840415e16ed62df93ac9f8d3685dc9869d7" } },
1903+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp312-cp312-linux_x86_64.whl", hashes = { sha256 = "b3a209621d0433367c489e8dce90ebc4c7c9e3bfe1c2b7adc928344f8290d5f5" } },
1904+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp313-cp313-linux_x86_64.whl", hashes = { sha256 = "bd2fab6dfb077dfc11fb581c5d33a0ee2d43ea13415530d095d1d5326d85d8b2" } },
1905+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp313-cp313t-linux_x86_64.whl", hashes = { sha256 = "0922f3ed8cd09f7d7f3a62138808efc9cdf9c9ee2913e00786e948b10f091fb0" } },
1906+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp314-cp314-linux_x86_64.whl", hashes = { sha256 = "af67c42a8022fc4de3ddb707a201bd0cd49a41ca761bf3da0537b3e127e5b4d0" } },
1907+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp314-cp314t-linux_x86_64.whl", hashes = { sha256 = "cdf579cf1982ade33011e5eafe0f741a2bfa1560075b23412e41b681d1fb8ca4" } },
1908+
{ url = "https://download.pytorch.org/whl/pytorch_triton_rocm-3.5.0-cp39-cp39-linux_x86_64.whl", hashes = { sha256 = "041415bbb4f11f0bbad13a39374cd5e0b9b74a5c0864ef8f58c38b635689123d" } },
19071909
]
19081910

19091911
[[packages]]
@@ -2588,14 +2590,15 @@ wheels = [
25882590

25892591
[[packages]]
25902592
name = "torch"
2591-
version = "2.8.0+rocm6.4"
2593+
version = "2.9.0+rocm6.4"
25922594
wheels = [
2593-
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.8.0%2Brocm6.4-cp310-cp310-manylinux_2_28_x86_64.whl", hashes = { sha256 = "1fb876f6c215282bbfe8f3dc6d17a3b8755865601fac44e7c036a3f55b84a315" } },
2594-
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.8.0%2Brocm6.4-cp311-cp311-manylinux_2_28_x86_64.whl", hashes = { sha256 = "2d35c37d15a0eb61ceab94dffae9399cab1b4a0d64668e4405025b5ce65df033" } },
2595-
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.8.0%2Brocm6.4-cp312-cp312-manylinux_2_28_x86_64.whl", hashes = { sha256 = "b2ee7e4967d4ad38689a38cefff2a60baa1e884fe61d96c390c187e525a02b3a" } },
2596-
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.8.0%2Brocm6.4-cp313-cp313-manylinux_2_28_x86_64.whl", hashes = { sha256 = "262ebeac6905df801969fb6dbdc085c75c21c5609dc8357334ec0b0a242e50eb" } },
2597-
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.8.0%2Brocm6.4-cp313-cp313t-manylinux_2_28_x86_64.whl", hashes = { sha256 = "5b7d7a8d0174ceb30d203bbf7e5cf04eb8d7aa9e8ed09521ca84a7f48979b4e9" } },
2598-
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.8.0%2Brocm6.4-cp39-cp39-manylinux_2_28_x86_64.whl", hashes = { sha256 = "3bc4073e4aaed0f0d1e4c3cfdb14ccde502654bfacd02b0f2b9d4f7d3f6bd89b" } },
2595+
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp310-cp310-manylinux_2_28_x86_64.whl", hashes = { sha256 = "400e63239a85e69c9c6dbae4c0e6c7bb01312f3ed5eef25f3b886ff73d7eb2ab" } },
2596+
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp311-cp311-manylinux_2_28_x86_64.whl", hashes = { sha256 = "a29a9cd848f281ed9af6c1c5651437897ef3320d125818fac2f050399890b32b" } },
2597+
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp312-cp312-manylinux_2_28_x86_64.whl", hashes = { sha256 = "bbb1d006c7a7f9f27447fd5c17674d408625e6e562a83f11f58460bb0acbd9c1" } },
2598+
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp313-cp313-manylinux_2_28_x86_64.whl", hashes = { sha256 = "c8a8876e7c5bd8fdcc394289076d2b83295addad4f5ca60ea3865123d5dc9f76" } },
2599+
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp313-cp313t-manylinux_2_28_x86_64.whl", hashes = { sha256 = "a025a602ce880e3a967cff80ce4c6c599e3f5b73195c18edc48b26ae2e5f1926" } },
2600+
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp314-cp314-manylinux_2_28_x86_64.whl", hashes = { sha256 = "94a2c768ed5b76c53ff152e43baf081fbab965a94ceac972c15c814ece7810d0" } },
2601+
{ url = "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp314-cp314t-manylinux_2_28_x86_64.whl", hashes = { sha256 = "54b34f01b05c629814573c2b5f0a86f910b934b0807cd063cd7f8d0abdd6a617" } },
25992602
]
26002603

26012604
[[packages]]

images/universal/training/rocm64-torch280-py312/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ dependencies = [
144144
# ===================================================================
145145

146146
# PyTorch with ROCm 6.4 (local version suffix)
147-
"torch==2.8.0+rocm6.4",
147+
"torch==2.9.0+rocm6.4",
148148

149149
# ML Training and Inference Libraries
150150
"peft==0.17.0",

0 commit comments

Comments
 (0)