Skip to content

Commit 360da1e

Browse files
Synchronize trainer manifests v2.1.0 (#3282)
* Update kubeflow/trainer manifests from v2.1.0
  Signed-off-by: juliusvonkohout <[email protected]>
* update script
  Signed-off-by: juliusvonkohout <[email protected]>
---------
Signed-off-by: juliusvonkohout <[email protected]>
1 parent aa0e7c2 commit 360da1e

33 files changed

+5013
-2451
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ This repository periodically synchronizes all official Kubeflow components from
5757
| Component | Local Manifests Path | Upstream Revision | CPU (millicores) | Memory (Mi) | PVC Storage (GB) |
5858
| - | - | - | - | - | - |
5959
| Training Operator | applications/training-operator/upstream | [v1.9.2](https://github.com/kubeflow/training-operator/tree/v1.9.2/manifests) | 3m | 25Mi | 0GB |
60-
| Trainer | applications/trainer/upstream | [f12a6d3](https://github.com/kubeflow/trainer/tree/f12a6d399a3dbb84d8829a5e7603ab310c45df6a/manifests) | 8m | 143Mi | 0GB |
60+
| Trainer | applications/trainer/upstream | [v2.1.0](https://github.com/kubeflow/trainer/tree/v2.1.0/manifests) | 8m | 143Mi | 0GB |
6161
| Notebook Controller | applications/jupyter/notebook-controller/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/notebook-controller/config) | 5m | 93Mi | 0GB |
6262
| PVC Viewer Controller | applications/pvcviewer-controller/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/pvcviewer-controller/config) | 15m | 128Mi | 0GB |
6363
| Tensorboard Controller | applications/tensorboard/tensorboard-controller/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/tensorboard-controller/config) | 15m | 128Mi | 0GB |

applications/trainer/upstream/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml

Lines changed: 525 additions & 93 deletions
Large diffs are not rendered by default.

applications/trainer/upstream/base/crds/trainer.kubeflow.org_trainingruntimes.yaml

Lines changed: 525 additions & 93 deletions
Large diffs are not rendered by default.

applications/trainer/upstream/base/crds/trainer.kubeflow.org_trainjobs.yaml

Lines changed: 3492 additions & 2190 deletions
Large diffs are not rendered by default.
Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,44 @@
1+
apiVersion: config.trainer.kubeflow.org/v1alpha1
2+
kind: Configuration
3+
# Health configuration
4+
health:
5+
healthProbeBindAddress: :8081
6+
readinessEndpointName: readyz
7+
livenessEndpointName: healthz
8+
9+
# Metrics configuration
10+
metrics:
11+
bindAddress: :8443
12+
secureServing: true
13+
14+
# Webhook configuration
15+
webhook:
16+
port: 9443
17+
host: ""
18+
19+
# Leader election configuration
20+
leaderElection:
21+
leaderElect: true
22+
resourceName: trainer.kubeflow.org
23+
resourceNamespace: ""
24+
leaseDuration: 15s
25+
renewDeadline: 10s
26+
retryPeriod: 2s
27+
28+
# Controller configuration
29+
controller:
30+
groupKindConcurrency:
31+
TrainJob.trainer.kubeflow.org: 5
32+
TrainingRuntime.trainer.kubeflow.org: 1
33+
ClusterTrainingRuntime.trainer.kubeflow.org: 1
34+
35+
# Certificate management configuration
36+
certManagement:
37+
enable: true
38+
webhookServiceName: kubeflow-trainer-controller-manager
39+
webhookSecretName: kubeflow-trainer-webhook-cert
40+
41+
# Client connection configuration
42+
clientConnection:
43+
qps: 50
44+
burst: 100
Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,16 @@
11
resources:
22
- manager.yaml
3+
4+
# Disable hash suffix for predictable ConfigMap names
5+
generatorOptions:
6+
disableNameSuffixHash: true
7+
8+
# ConfigMap generator for controller manager configuration
9+
configMapGenerator:
10+
- name: kubeflow-trainer-config
11+
files:
12+
- controller_manager_config.yaml
13+
14+
# Patches to mount the config file
15+
patches:
16+
- path: manager_config_patch.yaml
Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: kubeflow-trainer-controller-manager
5+
spec:
6+
template:
7+
spec:
8+
containers:
9+
- name: manager
10+
args:
11+
- --config=/controller_manager_config.yaml
12+
- --zap-log-level=2
13+
volumeMounts:
14+
- name: kubeflow-trainer-config
15+
mountPath: /controller_manager_config.yaml
16+
subPath: controller_manager_config.yaml
17+
readOnly: true
18+
volumes:
19+
- name: kubeflow-trainer-config
20+
configMap:
21+
name: kubeflow-trainer-config

applications/trainer/upstream/base/rbac/role.yaml

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,15 @@ rules:
4242
- list
4343
- update
4444
- watch
45+
- apiGroups:
46+
- coordination.k8s.io
47+
resources:
48+
- leases
49+
verbs:
50+
- create
51+
- get
52+
- list
53+
- update
4554
- apiGroups:
4655
- jobset.x-k8s.io
4756
resources:
@@ -62,6 +71,7 @@ rules:
6271
- list
6372
- watch
6473
- apiGroups:
74+
- scheduling.volcano.sh
6575
- scheduling.x-k8s.io
6676
resources:
6777
- podgroups
Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
4+
resources:
5+
- torch_distributed_with_cache.yaml
Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
apiVersion: trainer.kubeflow.org/v1alpha1
2+
kind: ClusterTrainingRuntime
3+
metadata:
4+
name: torch-distributed-with-cache
5+
labels:
6+
trainer.kubeflow.org/framework: torch
7+
spec:
8+
mlPolicy:
9+
numNodes: 1
10+
torch:
11+
numProcPerNode: auto
12+
template:
13+
spec:
14+
replicatedJobs:
15+
- name: dataset-initializer
16+
replicas: 1
17+
template:
18+
metadata:
19+
labels:
20+
trainer.kubeflow.org/trainjob-ancestor-step: dataset-initializer
21+
spec:
22+
template:
23+
spec:
24+
serviceAccountName: kubeflow-trainer-cache-initializer
25+
containers:
26+
- name: dataset-initializer
27+
image: ghcr.io/kubeflow/trainer/dataset-initializer
28+
env:
29+
- name: CACHE_IMAGE
30+
value: "ghcr.io/kubeflow/trainer/data-cache:v2.1.0"
31+
- name: TRAIN_JOB_NAME
32+
valueFrom:
33+
fieldRef:
34+
apiVersion: v1
35+
fieldPath: metadata.labels['jobset.sigs.k8s.io/jobset-name']
36+
- name: node
37+
dependsOn:
38+
- name: dataset-initializer
39+
status: Complete
40+
template:
41+
metadata:
42+
labels:
43+
trainer.kubeflow.org/trainjob-ancestor-step: trainer
44+
spec:
45+
template:
46+
spec:
47+
containers:
48+
- name: node
49+
image: pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime
50+
env:
51+
- name: TRAIN_JOB_NAME
52+
valueFrom:
53+
fieldRef:
54+
apiVersion: v1
55+
fieldPath: metadata.labels['jobset.sigs.k8s.io/jobset-name']

0 commit comments

Comments
 (0)