80 changes: 80 additions & 0 deletions charts/kubeflow-trainer/README.md
@@ -31,6 +31,57 @@ Alternatively, you can install the latest version from the master branch (e.g. `
helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer --version 0.0.0-sha-bfccb7b
```

### Install with ClusterTrainingRuntimes

You can optionally deploy ClusterTrainingRuntimes as part of the Helm installation. Runtimes are disabled by default to keep the chart lightweight.

To enable specific runtimes:
**Review comment (Member):** Add example where `enabledDefault` is true

```bash
helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
--version 2.1.0 \
--set runtimes.torchDistributed.enabled=true \
--set runtimes.deepspeedDistributed.enabled=true
```
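
Per the reviewer's request above, all default runtimes can also be enabled with one flag. A sketch, assuming the `runtimes.defaultEnabled` key documented in the chart's values table; the script only prints the command so the flag can be checked against `helm show values` first:

```shell
# Build the install command that flips runtimes.defaultEnabled,
# which enables the torch, deepspeed, mlx, and torchtune runtimes at once.
CHART="oci://ghcr.io/kubeflow/charts/kubeflow-trainer"
CMD="helm install kubeflow-trainer ${CHART} --version 2.1.0 --set runtimes.defaultEnabled=true"
# Print for review rather than executing directly.
echo "${CMD}"
```

Note that, per the values table, individual runtime flags are ignored once `runtimes.defaultEnabled` is set.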

Or use a custom values file:

```yaml
# values.yaml
runtimes:
  torchDistributed:
    enabled: true
  torchDistributedWithCache:
    enabled: true
    dataCache:
      enabled: true
      cacheImage:
        tag: "v2.0.0"
  deepspeedDistributed:
    enabled: true
  mlxDistributed:
    enabled: true

# Required for torch-distributed-with-cache
dataCache:
  enabled: true
```

Then install with:

```bash
helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
--version 2.1.0 \
-f values.yaml
```

### Available Runtimes

- **torch-distributed**: PyTorch distributed training (no custom images)
- **torch-distributed-with-cache**: PyTorch with distributed data cache support (requires `dataCache.enabled=true`)
- **deepspeed-distributed**: DeepSpeed distributed training with MPI
- **mlx-distributed**: MLX distributed training with MPI

### Uninstall the chart

@@ -72,6 +123,35 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum
| dataCache.enabled | bool | `false` | Enable/disable data cache support (LWS dependency, ClusterRole). Set to `true` to install data cache components. |
| dataCache.lws.install | bool | `true` | Whether to install LeaderWorkerSet as a dependency. Set to `false` if LeaderWorkerSet is already installed in the cluster. |
| dataCache.lws.fullnameOverride | string | `"lws"` | String to fully override LeaderWorkerSet release name. |
| runtimes | object | `{"deepspeedDistributed":{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/deepspeed-runtime","tag":""}},"defaultEnabled":false,"mlxDistributed":{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/mlx-runtime","tag":""}},"torchDistributed":{"enabled":false},"torchDistributedWithCache":{"dataCache":{"cacheImage":{"registry":"ghcr.io","repository":"kubeflow/trainer/data-cache","tag":""},"enabled":true},"enabled":false},"torchtuneDistributed":{"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/torchtune-trainer","tag":""},"llama3_2_1B":{"enabled":false},"llama3_2_3B":{"enabled":false},"qwen2_5_1_5B":{"enabled":false}}}` | ClusterTrainingRuntimes configuration These are optional runtime templates that can be deployed with the Helm chart. Each runtime provides a blueprint for different ML frameworks and configurations. |
| runtimes.defaultEnabled | bool | `false` | Enable all default runtimes (torch, deepspeed, mlx, torchtune) when set to true. Individual runtime settings will be ignored if this is enabled. |
| runtimes.torchDistributed | object | `{"enabled":false}` | PyTorch distributed training runtime (no custom images required) |
| runtimes.torchDistributed.enabled | bool | `false` | Enable deployment of torch-distributed runtime |
| runtimes.torchDistributedWithCache | object | `{"dataCache":{"cacheImage":{"registry":"ghcr.io","repository":"kubeflow/trainer/data-cache","tag":""},"enabled":true},"enabled":false}` | PyTorch distributed training with data cache support |
| runtimes.torchDistributedWithCache.enabled | bool | `false` | Enable deployment of torch-distributed-with-cache runtime |
| runtimes.torchDistributedWithCache.dataCache.cacheImage.registry | string | `"ghcr.io"` | Data cache image registry |
| runtimes.torchDistributedWithCache.dataCache.cacheImage.repository | string | `"kubeflow/trainer/data-cache"` | Data cache image repository |
| runtimes.torchDistributedWithCache.dataCache.cacheImage.tag | string | `""` | Data cache image tag. Defaults to chart version if empty. |
| runtimes.deepspeedDistributed | object | `{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/deepspeed-runtime","tag":""}}` | DeepSpeed distributed training runtime |
| runtimes.deepspeedDistributed.enabled | bool | `false` | Enable deployment of deepspeed-distributed runtime |
| runtimes.deepspeedDistributed.image.registry | string | `"ghcr.io"` | DeepSpeed runtime image registry |
| runtimes.deepspeedDistributed.image.repository | string | `"kubeflow/trainer/deepspeed-runtime"` | DeepSpeed runtime image repository |
| runtimes.deepspeedDistributed.image.tag | string | `""` | DeepSpeed runtime image tag. Defaults to chart version if empty. |
| runtimes.mlxDistributed | object | `{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/mlx-runtime","tag":""}}` | MLX distributed training runtime |
| runtimes.mlxDistributed.enabled | bool | `false` | Enable deployment of mlx-distributed runtime |
| runtimes.mlxDistributed.image.registry | string | `"ghcr.io"` | MLX runtime image registry |
| runtimes.mlxDistributed.image.repository | string | `"kubeflow/trainer/mlx-runtime"` | MLX runtime image repository |
| runtimes.mlxDistributed.image.tag | string | `""` | MLX runtime image tag. Defaults to chart version if empty. |
| runtimes.torchtuneDistributed | object | `{"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/torchtune-trainer","tag":""},"llama3_2_1B":{"enabled":false},"llama3_2_3B":{"enabled":false},"qwen2_5_1_5B":{"enabled":false}}` | TorchTune distributed training runtime |
| runtimes.torchtuneDistributed.image.registry | string | `"ghcr.io"` | TorchTune runtime image registry |
| runtimes.torchtuneDistributed.image.repository | string | `"kubeflow/trainer/torchtune-trainer"` | TorchTune runtime image repository |
| runtimes.torchtuneDistributed.image.tag | string | `""` | TorchTune runtime image tag. Defaults to chart version if empty. |
| runtimes.torchtuneDistributed.llama3_2_1B | object | `{"enabled":false}` | Llama 3.2 1B model configuration |
| runtimes.torchtuneDistributed.llama3_2_1B.enabled | bool | `false` | Enable deployment of Llama 3.2 1B runtime |
| runtimes.torchtuneDistributed.llama3_2_3B | object | `{"enabled":false}` | Llama 3.2 3B model configuration |
| runtimes.torchtuneDistributed.llama3_2_3B.enabled | bool | `false` | Enable deployment of Llama 3.2 3B runtime |
| runtimes.torchtuneDistributed.qwen2_5_1_5B | object | `{"enabled":false}` | Qwen 2.5 1.5B model configuration |
| runtimes.torchtuneDistributed.qwen2_5_1_5B.enabled | bool | `false` | Enable deployment of Qwen 2.5 1.5B runtime |
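
The `runtimes.defaultEnabled` row above says individual runtime settings are overridden when it is set; each runtime template gates its manifest on an `or` of the global and per-runtime flags. A plain-shell sketch of that precedence rule (function name is illustrative, not part of the chart):

```shell
# Mirrors the template condition:
#   {{- if or .Values.runtimes.<name>.enabled .Values.runtimes.defaultEnabled }}
runtime_enabled() {
  default_enabled="$1"; individual="$2"
  if [ "${default_enabled}" = "true" ] || [ "${individual}" = "true" ]; then
    echo true
  else
    echo false
  fi
}

runtime_enabled true false    # true: defaultEnabled wins
runtime_enabled false true    # true: individually enabled
runtime_enabled false false   # false
```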

## Maintainers

53 changes: 52 additions & 1 deletion charts/kubeflow-trainer/README.md.gotmpl
@@ -1,5 +1,5 @@
{{- /*
-Copyright 2025 The Kubeflow authors.
+Copyright 2026 The Kubeflow authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -49,6 +49,57 @@ Alternatively, you can install the latest version from the master branch (e.g. `
helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer --version 0.0.0-sha-bfccb7b
```

### Install with ClusterTrainingRuntimes

You can optionally deploy ClusterTrainingRuntimes as part of the Helm installation. Runtimes are disabled by default to keep the chart lightweight.

To enable specific runtimes:

```bash
helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
--version 2.1.0 \
--set runtimes.torchDistributed.enabled=true \
--set runtimes.deepspeedDistributed.enabled=true
```

Or use a custom values file:

```yaml
# values.yaml
runtimes:
  torchDistributed:
    enabled: true
  torchDistributedWithCache:
    enabled: true
    dataCache:
      enabled: true
      cacheImage:
        tag: "v2.0.0"
  deepspeedDistributed:
    enabled: true
  mlxDistributed:
    enabled: true

# Required for torch-distributed-with-cache
dataCache:
  enabled: true
```

Then install with:

```bash
helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
--version 2.1.0 \
-f values.yaml
```

### Available Runtimes

- **torch-distributed**: PyTorch distributed training (no custom images)
- **torch-distributed-with-cache**: PyTorch with distributed data cache support (requires `dataCache.enabled=true`)
- **deepspeed-distributed**: DeepSpeed distributed training with MPI
- **mlx-distributed**: MLX distributed training with MPI

### Uninstall the chart

35 changes: 35 additions & 0 deletions charts/kubeflow-trainer/examples/runtimes-values.yaml
@@ -0,0 +1,35 @@
# Example values.yaml configuration for deploying ClusterTrainingRuntimes

runtimes:
  # Deploy torch-distributed runtime (no custom images needed)
  torchDistributed:
    enabled: true

  # Deploy torch-distributed with data cache support
  torchDistributedWithCache:
    enabled: true
    # cacheImage will use chart version by default
    # To override, specify custom tag:
    # cacheImage:
    #   tag: "v2.0.0"

  # Deploy DeepSpeed runtime
  deepspeedDistributed:
    enabled: true
    # Override image tag if needed:
    # image:
    #   tag: "custom-v1.0.0"

  # Deploy MLX runtime
  mlxDistributed:
    enabled: false
    # Can enable and customize:
    # enabled: true
    # image:
    #   registry: my-registry.io
    #   repository: custom/mlx-runtime
    #   tag: "v1.0.0"

# Note: torch-distributed-with-cache requires data cache support
dataCache:
  enabled: true
53 changes: 45 additions & 8 deletions charts/kubeflow-trainer/templates/_helpers.tpl
@@ -64,24 +64,61 @@ app.kubernetes.io/name: {{ include "trainer.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Resolve the effective image tag, using a provided tag if present or
falling back to the default image tag derived from the chart version.
Usage: include "trainer.resolveImageTag" (dict "tag" .Values.image.tag "context" .)
*/}}
{{- define "trainer.resolveImageTag" -}}
{{- if .tag }}
{{- .tag -}}
{{- else -}}
{{- include "trainer.defaultImageTag" .context -}}
{{- end -}}
{{- end }}

{{- define "trainer.image" -}}
{{- $imageRegistry := .Values.image.registry | default "docker.io" }}
{{- $imageRepository := .Values.image.repository }}
{{- $imageTag := .Values.image.tag -}}
{{- if not $imageTag -}}
{{- if hasPrefix "0.0.0-" .Chart.Version -}}
{{- $imageTag = trimPrefix "0.0.0-" .Chart.Version -}}
{{- else -}}
{{- $imageTag = printf "v%s" .Chart.Version -}}
{{- end -}}
{{- end -}}
**Review comment on lines -70 to -77 (Member):** Do you need to make changes to it?

**Reply (Contributor, author):** The code was refactored to use a more modular approach: the inline logic was split into `trainer.resolveImageTag` and `trainer.defaultImageTag` helpers for better maintainability. No functional changes, just better organization that lets both manager and runtime images share the same tag resolution logic.

**Reply (Member):** I see, that makes sense, thanks for clarifying!

{{- $imageTag := include "trainer.resolveImageTag" (dict "tag" .Values.image.tag "context" .) -}}
{{- if eq $imageRegistry "docker.io" }}
{{- printf "%s:%s" $imageRepository $imageTag }}
{{- else }}
{{- printf "%s/%s:%s" $imageRegistry $imageRepository $imageTag }}
{{- end }}
{{- end }}

{{/*
Generate the default image tag for runtimes based on chart version
*/}}
{{- define "trainer.defaultImageTag" -}}
{{- if hasPrefix "0.0.0-" .Chart.Version -}}
{{- trimPrefix "0.0.0-" .Chart.Version -}}
{{- else -}}
{{- printf "v%s" .Chart.Version -}}
{{- end -}}
{{- end }}
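
The version-to-tag mapping above is easy to mis-read in template syntax; a plain-shell sketch of the same rule (assumption: chart versions are either `0.0.0-<suffix>` dev builds or plain semver, and the function name is illustrative):

```shell
# Mirrors trainer.defaultImageTag: dev chart versions of the form
# 0.0.0-<suffix> resolve to <suffix>; release versions get a "v" prefix.
default_image_tag() {
  chart_version="$1"
  case "${chart_version}" in
    0.0.0-*) printf '%s' "${chart_version#0.0.0-}" ;;
    *)       printf 'v%s' "${chart_version}" ;;
  esac
}

default_image_tag "0.0.0-sha-bfccb7b"; echo   # sha-bfccb7b
default_image_tag "2.1.0"; echo               # v2.1.0
```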

{{/*
Generate runtime image with registry, repository, and tag from values
Usage: include "trainer.runtimeImage" (list .Values.runtimes.deepspeedDistributed.image .)
*/}}
{{- define "trainer.runtimeImage" -}}
{{- $imageConfig := index . 0 }}
{{- $root := index . 1 }}
{{- $registry := $imageConfig.registry | default "ghcr.io" }}
{{- $repository := $imageConfig.repository }}
{{- $tag := include "trainer.resolveImageTag" (dict "tag" ($imageConfig.tag) "context" $root) -}}
{{- if eq $registry "docker.io" }}
{{- printf "%s:%s" $repository $tag }}
{{- else }}
{{- printf "%s/%s:%s" $registry $repository $tag }}
{{- end }}
{{- end }}
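
`trainer.runtimeImage` concatenates registry, repository, and resolved tag, omitting `docker.io` so Docker Hub references keep their conventional short form. A shell sketch of that assembly (function names are illustrative, not part of the chart):

```shell
# Mirrors trainer.resolveImageTag: explicit tag wins, otherwise
# fall back to the chart-version-derived default tag.
resolve_tag() {
  tag="$1"; chart_version="$2"
  if [ -n "${tag}" ]; then printf '%s' "${tag}"; return; fi
  case "${chart_version}" in
    0.0.0-*) printf '%s' "${chart_version#0.0.0-}" ;;
    *)       printf 'v%s' "${chart_version}" ;;
  esac
}

# Mirrors trainer.runtimeImage: docker.io is dropped from the reference.
runtime_image() {
  registry="$1"; repository="$2"; tag="$3"; chart_version="$4"
  resolved="$(resolve_tag "${tag}" "${chart_version}")"
  if [ "${registry}" = "docker.io" ]; then
    printf '%s:%s' "${repository}" "${resolved}"
  else
    printf '%s/%s:%s' "${registry}" "${repository}" "${resolved}"
  fi
}

runtime_image ghcr.io kubeflow/trainer/deepspeed-runtime "" 2.1.0; echo
# ghcr.io/kubeflow/trainer/deepspeed-runtime:v2.1.0
runtime_image docker.io library/busybox 1.36 2.1.0; echo
# library/busybox:1.36
```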
{{/*
Return the version of the trainer.
If the version is 0.0.0, we assume it is a development version.
*/}}
{{- define "trainer.version" -}}
**Review comment (Member):** Please add a comment to explain how the `trainer.version` variable is used.

**Reply (Contributor, author):** Added the comment, thanks.


{{- if hasPrefix "0.0.0-" .Chart.Version -}}
dev
@@ -0,0 +1,75 @@
{{- /*
Copyright 2025 The Kubeflow authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/ -}}

{{- if or .Values.runtimes.deepspeedDistributed.enabled .Values.runtimes.defaultEnabled }}
**Review comment (Member):** You need to check for `defaultEnabled` in these runtimes too: the mlx, torch, and torchtune runtimes.

**Reply (Contributor, author):** Thanks for the review. I've updated the others (mlx, torch, and torchtune) to respect the `defaultEnabled` flag as well.

apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  name: deepspeed-distributed
  labels:
    trainer.kubeflow.org/framework: deepspeed
    {{- include "trainer.labels" . | nindent 4 }}
spec:
  mlPolicy:
    numNodes: 1
    mpi:
      numProcPerNode: 1
      mpiImplementation: OpenMPI
      sshAuthMountPath: /home/mpiuser/.ssh
      runLauncherAsNode: true
  template:
    spec:
      network:
        publishNotReadyAddresses: true
      successPolicy:
        operator: All
        targetReplicatedJobs:
          - launcher
      replicatedJobs:
        - name: launcher
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: trainer
            spec:
              template:
                spec:
                  containers:
                    - name: node
                      image: {{ include "trainer.runtimeImage" (list .Values.runtimes.deepspeedDistributed.image .) }}
                      securityContext:
                        runAsUser: 1000
        - name: node
          template:
            spec:
              template:
                spec:
                  containers:
                    - name: node
                      image: {{ include "trainer.runtimeImage" (list .Values.runtimes.deepspeedDistributed.image .) }}
                      securityContext:
                        runAsUser: 1000
                      command:
                        - /usr/sbin/sshd
                      args:
                        - -De
                        - -f
                        - /home/mpiuser/.sshd_config
                      readinessProbe:
                        tcpSocket:
                          port: 2222
                        initialDelaySeconds: 5
{{- end }}
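
A runtime installed by this template is consumed by referencing its name from a TrainJob. A minimal sketch, assuming the `trainer.kubeflow.org/v1alpha1` TrainJob API with a `runtimeRef` field; the job name is a placeholder:

```yaml
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: deepspeed-example  # placeholder name
spec:
  runtimeRef:
    name: deepspeed-distributed  # the ClusterTrainingRuntime above
```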