Patch summary: add kustomize replacements that copy training-runtime image
references from the rhoai-config ConfigMap into each ClusterTrainingRuntime,
add the torch-distributed-rocm runtime, and drop the torch 2.4.1/2.5.1 runtimes.

diff --git a/manifests/rhoai/kustomization.yaml b/manifests/rhoai/kustomization.yaml
index 2f10a2fc88..b997e50cc2 100644
--- a/manifests/rhoai/kustomization.yaml
+++ b/manifests/rhoai/kustomization.yaml
@@ -15,6 +15,7 @@ configurations:
 - params.yaml
 
 replacements:
+# Replace controller image
 - source:
     kind: ConfigMap
     name: rhoai-config
@@ -27,6 +28,71 @@ replacements:
     fieldPaths:
     - spec.template.spec.containers.0.image
 
+# Replace image for torch-distributed-rocm
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-rocm64-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: torch-distributed-rocm
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for torch-distributed-th03-cuda128-torch28-py312
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: torch-distributed-th03-cuda128-torch28-py312
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for torch-distributed
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: torch-distributed
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for training-hub03-cuda128-torch28-py312 (NOTE(review): confirm against the runtime's metadata.name)
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: training-hub03-cuda128-torch28-py312
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
+# Replace image for training-hub
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-training-cuda128-torch28-py312-image
+  targets:
+  - select:
+      kind: ClusterTrainingRuntime
+      name: training-hub
+    fieldPaths:
+    - spec.template.spec.replicatedJobs.[name=node].template.spec.template.spec.containers.[name=node].image
+
 # Labels to add to all resources and selectors.
 labels:
 - includeSelectors: true
diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env
index be4de6d00b..b9d40322c3 100644
--- a/manifests/rhoai/params.env
+++ b/manifests/rhoai/params.env
@@ -1 +1,3 @@
 odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0
+odh-training-cuda128-torch28-py312-image=quay.io/modh/training:py312-cuda128-torch280
+odh-training-rocm64-torch28-py312-image=quay.io/modh/training:py312-rocm64-torch280
diff --git a/manifests/rhoai/runtimes/kustomization.yaml b/manifests/rhoai/runtimes/kustomization.yaml
index a0436ea022..324ff8b267 100644
--- a/manifests/rhoai/runtimes/kustomization.yaml
+++ b/manifests/rhoai/runtimes/kustomization.yaml
@@ -1,10 +1,7 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-  - torch_cuda_241.yaml
-  - torch_cuda_251.yaml
-  - torch_rocm_241.yaml
-  - torch_rocm_251.yaml
+  - torch_distributed_rocm.yaml
   - torch_distributed_th03_cuda128_torch28_py312.yaml
   - torch_distributed.yaml
   - training_hub_th03_cuda128_torch28_py312.yaml
diff --git a/manifests/rhoai/runtimes/torch_cuda_251.yaml b/manifests/rhoai/runtimes/torch_cuda_251.yaml
deleted file mode 100644
index 173d18946a..0000000000
--- a/manifests/rhoai/runtimes/torch_cuda_251.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-apiVersion: trainer.kubeflow.org/v1alpha1
-kind: ClusterTrainingRuntime
-metadata:
-  name: torch-cuda-251
-  labels:
-    trainer.kubeflow.org/framework: torch
-spec:
-  mlPolicy:
-    numNodes: 1
-    torch:
-      numProcPerNode: auto
-  template:
-    spec:
-      replicatedJobs:
-        - name: node
-          template:
-            metadata:
-              labels:
-                trainer.kubeflow.org/trainjob-ancestor-step: trainer
-            spec:
-              template:
-                spec:
-                  containers:
-                    - name: node
-                      image: quay.io/modh/training:py311-cuda124-torch251
diff --git a/manifests/rhoai/runtimes/torch_distributed.yaml b/manifests/rhoai/runtimes/torch_distributed.yaml
index 81a6011e83..5da2a04b14 100644
--- a/manifests/rhoai/runtimes/torch_distributed.yaml
+++ b/manifests/rhoai/runtimes/torch_distributed.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280
diff --git a/manifests/rhoai/runtimes/torch_cuda_241.yaml b/manifests/rhoai/runtimes/torch_distributed_rocm.yaml
similarity index 83%
rename from manifests/rhoai/runtimes/torch_cuda_241.yaml
rename to manifests/rhoai/runtimes/torch_distributed_rocm.yaml
index b0ce984264..82d500d54a 100644
--- a/manifests/rhoai/runtimes/torch_cuda_241.yaml
+++ b/manifests/rhoai/runtimes/torch_distributed_rocm.yaml
@@ -1,7 +1,7 @@
 apiVersion: trainer.kubeflow.org/v1alpha1
 kind: ClusterTrainingRuntime
 metadata:
-  name: torch-cuda-241
+  name: torch-distributed-rocm
   labels:
     trainer.kubeflow.org/framework: torch
 spec:
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/modh/training:py311-cuda121-torch241
+                      image: quay.io/modh/training:py312-rocm64-torch280
diff --git a/manifests/rhoai/runtimes/torch_distributed_th03_cuda128_torch28_py312.yaml b/manifests/rhoai/runtimes/torch_distributed_th03_cuda128_torch28_py312.yaml
index 28a763a82e..c4ece59639 100644
--- a/manifests/rhoai/runtimes/torch_distributed_th03_cuda128_torch28_py312.yaml
+++ b/manifests/rhoai/runtimes/torch_distributed_th03_cuda128_torch28_py312.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280
diff --git a/manifests/rhoai/runtimes/torch_rocm_241.yaml b/manifests/rhoai/runtimes/torch_rocm_241.yaml
deleted file mode 100644
index 9a8c779b4a..0000000000
--- a/manifests/rhoai/runtimes/torch_rocm_241.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-apiVersion: trainer.kubeflow.org/v1alpha1
-kind: ClusterTrainingRuntime
-metadata:
-  name: torch-rocm-241
-  labels:
-    trainer.kubeflow.org/framework: torch
-spec:
-  mlPolicy:
-    numNodes: 1
-    torch:
-      numProcPerNode: auto
-  template:
-    spec:
-      replicatedJobs:
-        - name: node
-          template:
-            metadata:
-              labels:
-                trainer.kubeflow.org/trainjob-ancestor-step: trainer
-            spec:
-              template:
-                spec:
-                  containers:
-                    - name: node
-                      image: quay.io/modh/training:py311-rocm62-torch241
diff --git a/manifests/rhoai/runtimes/torch_rocm_251.yaml b/manifests/rhoai/runtimes/torch_rocm_251.yaml
deleted file mode 100644
index 925a8a12fb..0000000000
--- a/manifests/rhoai/runtimes/torch_rocm_251.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-apiVersion: trainer.kubeflow.org/v1alpha1
-kind: ClusterTrainingRuntime
-metadata:
-  name: torch-rocm-251
-  labels:
-    trainer.kubeflow.org/framework: torch
-spec:
-  mlPolicy:
-    numNodes: 1
-    torch:
-      numProcPerNode: auto
-  template:
-    spec:
-      replicatedJobs:
-        - name: node
-          template:
-            metadata:
-              labels:
-                trainer.kubeflow.org/trainjob-ancestor-step: trainer
-            spec:
-              template:
-                spec:
-                  containers:
-                    - name: node
-                      image: quay.io/modh/training:py311-rocm62-torch251
diff --git a/manifests/rhoai/runtimes/training_hub.yaml b/manifests/rhoai/runtimes/training_hub.yaml
index 297011a580..a364ff5c2e 100644
--- a/manifests/rhoai/runtimes/training_hub.yaml
+++ b/manifests/rhoai/runtimes/training_hub.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280
diff --git a/manifests/rhoai/runtimes/training_hub_th03_cuda128_torch28_py312.yaml b/manifests/rhoai/runtimes/training_hub_th03_cuda128_torch28_py312.yaml
index da236aabac..be20dfdceb 100644
--- a/manifests/rhoai/runtimes/training_hub_th03_cuda128_torch28_py312.yaml
+++ b/manifests/rhoai/runtimes/training_hub_th03_cuda128_torch28_py312.yaml
@@ -22,4 +22,4 @@ spec:
                 spec:
                   containers:
                     - name: node
-                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+                      image: quay.io/modh/training:py312-cuda128-torch280