Skip to content

Commit 7c732f8

Browse files
committed
Merge remote-tracking branch 'upstream/main' into rhoai-3.2
2 parents 43f1bf7 + 2a68960 commit 7c732f8

11 files changed

+75
-85
lines changed

manifests/rhoai/kustomization.yaml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ configurations:
1515
- params.yaml
1616

1717
replacements:
18+
# Replace controller image
1819
- source:
1920
kind: ConfigMap
2021
name: rhoai-config
@@ -27,6 +28,71 @@ replacements:
2728
fieldPaths:
2829
- spec.template.spec.containers.0.image
2930

31+
# Replace image for torch-distributed-rocm
32+
- source:
33+
kind: ConfigMap
34+
name: rhoai-config
35+
version: v1
36+
fieldPath: data.odh-training-rocm64-torch28-py312-image
37+
targets:
38+
- select:
39+
kind: ClusterTrainingRuntime
40+
name: torch-distributed-rocm
41+
fieldPaths:
42+
- spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
43+
44+
# Replace image for torch-distributed-th03-cuda128-torch28-py312
45+
- source:
46+
kind: ConfigMap
47+
name: rhoai-config
48+
version: v1
49+
fieldPath: data.odh-training-cuda128-torch28-py312-image
50+
targets:
51+
- select:
52+
kind: ClusterTrainingRuntime
53+
name: torch-distributed-th03-cuda128-torch28-py312
54+
fieldPaths:
55+
- spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
56+
57+
# Replace image for torch-distributed
58+
- source:
59+
kind: ConfigMap
60+
name: rhoai-config
61+
version: v1
62+
fieldPath: data.odh-training-cuda128-torch28-py312-image
63+
targets:
64+
- select:
65+
kind: ClusterTrainingRuntime
66+
name: torch-distributed
67+
fieldPaths:
68+
- spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
69+
70+
# Replace image for training-hub03-cuda128-torch28-py312
71+
- source:
72+
kind: ConfigMap
73+
name: rhoai-config
74+
version: v1
75+
fieldPath: data.odh-training-cuda128-torch28-py312-image
76+
targets:
77+
- select:
78+
kind: ClusterTrainingRuntime
79+
name: training-hub03-cuda128-torch28-py312
80+
fieldPaths:
81+
- spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
82+
83+
# Replace image for training-hub
84+
- source:
85+
kind: ConfigMap
86+
name: rhoai-config
87+
version: v1
88+
fieldPath: data.odh-training-cuda128-torch28-py312-image
89+
targets:
90+
- select:
91+
kind: ClusterTrainingRuntime
92+
name: training-hub
93+
fieldPaths:
94+
- spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
95+
3096
# Labels to add to all resources and selectors.
3197
labels:
3298
- includeSelectors: true

manifests/rhoai/params.env

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0
2+
odh-training-cuda128-torch28-py312-image=quay.io/modh/training:py312-cuda128-torch280
3+
odh-training-rocm64-torch28-py312-image=quay.io/modh/training:py312-rocm64-torch280

manifests/rhoai/runtimes/kustomization.yaml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
apiVersion: kustomize.config.k8s.io/v1beta1
22
kind: Kustomization
33
resources:
4-
- torch_cuda_241.yaml
5-
- torch_cuda_251.yaml
6-
- torch_rocm_241.yaml
7-
- torch_rocm_251.yaml
4+
- torch_distributed_rocm.yaml
85
- torch_distributed_th03_cuda128_torch28_py312.yaml
96
- torch_distributed.yaml
107
- training_hub_th03_cuda128_torch28_py312.yaml

manifests/rhoai/runtimes/torch_cuda_251.yaml

Lines changed: 0 additions & 25 deletions
This file was deleted.

manifests/rhoai/runtimes/torch_distributed.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ spec:
2222
spec:
2323
containers:
2424
- name: node
25-
image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
25+
image: quay.io/modh/training:py312-cuda128-torch280

manifests/rhoai/runtimes/torch_cuda_241.yaml renamed to manifests/rhoai/runtimes/torch_distributed_rocm.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: trainer.kubeflow.org/v1alpha1
22
kind: ClusterTrainingRuntime
33
metadata:
4-
name: torch-cuda-241
4+
name: torch-distributed-rocm
55
labels:
66
trainer.kubeflow.org/framework: torch
77
spec:
@@ -22,4 +22,4 @@ spec:
2222
spec:
2323
containers:
2424
- name: node
25-
image: quay.io/modh/training:py311-cuda121-torch241
25+
image: quay.io/modh/training:py312-rocm64-torch280

manifests/rhoai/runtimes/torch_distributed_th03_cuda128_torch28_py312.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ spec:
2222
spec:
2323
containers:
2424
- name: node
25-
image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
25+
image: quay.io/modh/training:py312-cuda128-torch280

manifests/rhoai/runtimes/torch_rocm_241.yaml

Lines changed: 0 additions & 25 deletions
This file was deleted.

manifests/rhoai/runtimes/torch_rocm_251.yaml

Lines changed: 0 additions & 25 deletions
This file was deleted.

manifests/rhoai/runtimes/training_hub.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ spec:
2222
spec:
2323
containers:
2424
- name: node
25-
image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
25+
image: quay.io/modh/training:py312-cuda128-torch280

0 commit comments

Comments
 (0)