opendatahub-io · sutaakar · Dec 9, 2025 · Dec 5, 2025 · robert-bell · Dec 9, 2025
diff --git a/manifests/rhoai/kustomization.yaml b/manifests/rhoai/kustomization.yaml
@@ -15,6 +15,7 @@ configurations:
 - params.yaml
 
 replacements:
+# Replace controller image
 - source:
     kind: ConfigMap
     name: rhoai-config
@@ -27,6 +28,71 @@ replacements:
     fieldPaths:
     - spec.template.spec.containers.0.image
 
+# Replace image for torch-distributed-rocm
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-rocm64-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: torch-distributed-rocm
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for torch-distributed-th03-cuda128-torch28-py312
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: torch-distributed-th03-cuda128-torch28-py312
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for torch-distributed
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: torch-distributed
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for training-hub03-cuda128-torch28-py312 (selector name must equal the runtime's metadata.name)
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: training-hub03-cuda128-torch28-py312
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for training-hub
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: training-hub
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
 # Labels to add to all resources and selectors.
 labels:
 - includeSelectors: true

diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env
@@ -1 +1,3 @@
 odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0
+odh-training-cuda128-torch28-py312-image=quay.io/modh/training:py312-cuda128-torch280
+odh-training-rocm64-torch28-py312-image=quay.io/modh/training:py312-rocm64-torch280
diff --git a/manifests/rhoai/runtimes/kustomization.yaml b/manifests/rhoai/runtimes/kustomization.yaml
@@ -1,10 +1,7 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-  - torch_cuda_241.yaml
-  - torch_cuda_251.yaml
-  - torch_rocm_241.yaml
-  - torch_rocm_251.yaml
+  - torch_distributed_rocm.yaml
   - torch_distributed_th03_cuda128_torch28_py312.yaml
   - torch_distributed.yaml
   - training_hub_th03_cuda128_torch28_py312.yaml

diff --git a/manifests/rhoai/runtimes/torch_cuda_251.yaml b/manifests/rhoai/runtimes/torch_cuda_251.yaml
diff --git a/manifests/rhoai/runtimes/torch_distributed.yaml b/manifests/rhoai/runtimes/torch_distributed.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280
diff --git a/manifests/rhoai/runtimes/torch_cuda_241.yaml → ...hoai/runtimes/torch_distributed_rocm.yaml b/manifests/rhoai/runtimes/torch_cuda_241.yaml → ...hoai/runtimes/torch_distributed_rocm.yaml
@@ -1,7 +1,7 @@
 apiVersion: trainer.kubeflow.org/v1alpha1
 kind: ClusterTrainingRuntime
 metadata:
-  name: torch-cuda-241
+  name: torch-distributed-rocm
   labels:
     trainer.kubeflow.org/framework: torch
 spec:
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/modh/training:py311-cuda121-torch241
+                      image: quay.io/modh/training:py312-rocm64-torch280
diff --git a/manifests/rhoai/runtimes/torch_distributed_th03_cuda128_torch28_py312.yaml b/manifests/rhoai/runtimes/torch_distributed_th03_cuda128_torch28_py312.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280
diff --git a/manifests/rhoai/runtimes/torch_rocm_241.yaml b/manifests/rhoai/runtimes/torch_rocm_241.yaml
diff --git a/manifests/rhoai/runtimes/torch_rocm_251.yaml b/manifests/rhoai/runtimes/torch_rocm_251.yaml
diff --git a/manifests/rhoai/runtimes/training_hub.yaml b/manifests/rhoai/runtimes/training_hub.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280
diff --git a/manifests/rhoai/runtimes/training_hub_th03_cuda128_torch28_py312.yaml b/manifests/rhoai/runtimes/training_hub_th03_cuda128_torch28_py312.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280