
Commit b519083

Merge remote-tracking branch 'upstream/main'
2 parents: 9cf7cac + 92c8408

File tree

4 files changed (+12 −34 lines)

.tekton/training-rocm-push.yaml

Lines changed: 2 additions & 2 deletions

@@ -24,9 +24,9 @@ spec:
     - name: revision
       value: '{{revision}}'
     - name: output-image
-      value: quay.io/modh/training:py311-rocm61-torch241
+      value: quay.io/modh/training:py311-rocm62-torch241
     - name: additional-tag
-      value: py311-rocm61-torch241-{{revision}}
+      value: py311-rocm62-torch241-{{revision}}
     - name: dockerfile
       value: Dockerfile
     - name: path-context

images/runtime/training/rocm/Dockerfile

Lines changed: 9 additions & 9 deletions

@@ -1,15 +1,15 @@
 ## Global Args ######################################################
-ARG IMAGE_TAG=1-77.1729776556
+ARG IMAGE_TAG=9.5-1737537151
 ARG PYTHON_VERSION=311

 # use UBI9 latest
 FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG} AS base

-LABEL name="training:py311-rocm61-torch241" \
-      summary="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      io.k8s.display-name="ROCm 6.1 Python 3.11 PyTorch 2.4.1 base image for Training" \
-      io.k8s.description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+LABEL name="training:py311-rocm62-torch241" \
+      summary="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      io.k8s.display-name="ROCm 6.2 Python 3.11 PyTorch 2.4.1 base image for Training" \
+      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
       authoritative-source-url="https://github.com/opendatahub-io/distributed-workloads"

 # Copy license
@@ -25,8 +25,8 @@ RUN pip install --no-cache-dir --upgrade requests==2.32.3
 # Install ROCm
 WORKDIR /opt/app-root/bin

-ARG ROCM_VERSION=6.1.2
-ARG AMDGPU_VERSION=6.1.2
+ARG ROCM_VERSION=6.2.4
+ARG AMDGPU_VERSION=6.2.4

 RUN <<EOF
 cat <<EOD > /etc/yum.repos.d/rocm.repo
@@ -48,7 +48,7 @@ gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
 EOD
 EOF

-RUN dnf -y install rocm && dnf clean all && rm -rf /var/cache/dnf
+RUN dnf install -y rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf

 # Install Python packages
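
For a quick local sanity check of the rebuilt image, a minimal sketch is shown below. The podman commands, the locally applied tag, and the /opt/rocm/.info/version path are assumptions for illustration; the package names simply mirror the meta-packages installed by the updated dnf line above.

# Hypothetical local check, assuming podman and the tag used by the Tekton push pipeline.
podman build -t quay.io/modh/training:py311-rocm62-torch241 images/runtime/training/rocm/
# Confirm the ROCm 6.2 meta-packages from the updated dnf install line are present.
podman run --rm quay.io/modh/training:py311-rocm62-torch241 \
  rpm -q rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils
# Print the ROCm release recorded in the image (path is an assumption).
podman run --rm quay.io/modh/training:py311-rocm62-torch241 cat /opt/rocm/.info/version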

images/runtime/training/rocm/README.md

Lines changed: 1 addition & 1 deletion

@@ -5,5 +5,5 @@ ROCm enabled container image for Training in OpenShift AI.
 It includes the following layers:
 * UBI 9
 * Python 3.11
-* ROCm 6.1
+* ROCm 6.2
 * PyTorch 2.4.1

tests/kfto/kfto_mnist_training_test.go

Lines changed: 0 additions & 22 deletions

@@ -20,7 +20,6 @@ import (
     "bytes"
     "fmt"
     "testing"
-    "time"

     kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
     . "github.com/onsi/gomega"
@@ -83,27 +82,6 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
     test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
         Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))

-    // Verify GPU utilization
-    if IsOpenShift(test) && accelerator == NVIDIA {
-        trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
-        test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node
-
-        for _, trainingPod := range trainingPods {
-            // Check that GPUs for training pods were utilized recently
-            test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
-                Should(
-                    And(
-                        HaveLen(numProcPerNode),
-                        ContainElement(
-                            // Check that at least some GPU was utilized on more than 20%
-                            HaveField("Value", BeNumerically(">", 20)),
-                        ),
-                    ),
-                )
-        }
-        test.T().Log("All GPUs were successfully utilized")
-    }
-
     // Make sure the PyTorch job succeeded
     test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
     test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
