File tree Expand file tree Collapse file tree 6 files changed +178
-111
lines changed
Expand file tree Collapse file tree 6 files changed +178
-111
lines changed Load Diff This file was deleted.
Load Diff This file was deleted.
Original file line number Diff line number Diff line change 1+ resources :
2+ - v1alpha1/cron/cron-pytorch.yaml
3+ - v1alpha1/cron/cron-tf.yaml
4+ - v1alpha1/cron/cron-mpi.yaml
Original file line number Diff line number Diff line change 1+ apiVersion : apps.kubedl.io/v1alpha1
2+ kind : Cron
3+ metadata :
4+ name : cron-mpi
5+ labels :
6+ app.kubernetes.io/name : cron-operator
7+ app.kubernetes.io/managed-by : kustomize
8+ spec :
9+ schedule : "*/1 * * * *"
10+ concurrencyPolicy : Forbid
11+ historyLimit : 3
12+ template :
13+ workload :
14+ apiVersion : kubeflow.org/v1
15+ kind : MPIJob
16+ metadata :
17+ labels :
18+ key1 : value1
19+ key2 : value2
20+ annotations :
21+ key1 : value1
22+ key2 : value2
23+ spec :
24+ mpiReplicaSpecs :
25+ Launcher :
26+ replicas : 1
27+ template :
28+ spec :
29+ containers :
30+ - name : mpi
31+ image : kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
32+ imagePullPolicy : IfNotPresent
33+ command :
34+ - /bin/bash
35+ - -c
36+ args :
37+ - |
38+ echo "Launcher: Start training..."
39+ sleep 30
40+ echo "Launcher: Training completed."
41+ Worker :
42+ replicas : 2
43+ restartPolicy : Never
44+ template :
45+ spec :
46+ containers :
47+ - name : mpi
48+ image : kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
49+ imagePullPolicy : IfNotPresent
50+ command :
51+ - /bin/bash
52+ - -c
53+ args :
54+ - |
55+ echo "Worker: Start training..."
56+ sleep 30
57+ echo "Worker: Training completed."
Original file line number Diff line number Diff line change 1+ apiVersion : apps.kubedl.io/v1alpha1
2+ kind : Cron
3+ metadata :
4+ name : cron-pytorch
5+ labels :
6+ app.kubernetes.io/name : cron-operator
7+ app.kubernetes.io/managed-by : kustomize
8+ spec :
9+ schedule : "*/1 * * * *"
10+ concurrencyPolicy : Forbid
11+ historyLimit : 3
12+ template :
13+ workload :
14+ apiVersion : kubeflow.org/v1
15+ kind : PyTorchJob
16+ metadata :
17+ labels :
18+ key1 : value1
19+ key2 : value2
20+ annotations :
21+ key1 : value1
22+ key2 : value2
23+ spec :
24+ pytorchReplicaSpecs :
25+ Master :
26+ replicas : 1
27+ restartPolicy : Never
28+ template :
29+ spec :
30+ containers :
31+ - name : pytorch
32+ image : kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
33+ imagePullPolicy : IfNotPresent
34+ command :
35+ - /bin/bash
36+ - -c
37+ args :
38+ - |
39+ echo "Master: Start training..."
40+ sleep 30
41+ echo "Master: Training completed."
42+ Worker :
43+ replicas : 2
44+ restartPolicy : Never
45+ template :
46+ spec :
47+ containers :
48+ - name : pytorch
49+ image : kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
50+ imagePullPolicy : IfNotPresent
51+ command :
52+ - /bin/bash
53+ - -c
54+ args :
55+ - |
56+ echo "Worker: Start training..."
57+ sleep 30
58+ echo "Worker: Training completed."
Original file line number Diff line number Diff line change 1+ apiVersion : apps.kubedl.io/v1alpha1
2+ kind : Cron
3+ metadata :
4+ name : cron-tf
5+ labels :
6+ app.kubernetes.io/name : cron-operator
7+ app.kubernetes.io/managed-by : kustomize
8+ spec :
9+ schedule : "*/1 * * * *"
10+ concurrencyPolicy : Forbid
11+ historyLimit : 3
12+ template :
13+ workload :
14+ apiVersion : kubeflow.org/v1
15+ kind : TFJob
16+ metadata :
17+ labels :
18+ key1 : value1
19+ key2 : value2
20+ annotations :
21+ key1 : value1
22+ key2 : value2
23+ spec :
24+ cleanPodPolicy : None
25+ tfReplicaSpecs :
26+ PS :
27+ replicas : 1
28+ restartPolicy : Never
29+ template :
30+ spec :
31+ containers :
32+ - name : tensorflow
33+ image : kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/tensorflow-mnist-example:2.15.0-gpu
34+ imagePullPolicy : IfNotPresent
35+ command :
36+ - /bin/bash
37+ - -c
38+ args :
39+ - |
40+ echo "PS: Start training..."
41+ sleep 30
42+ echo "PS: Training completed."
43+ Worker :
44+ replicas : 2
45+ restartPolicy : Never
46+ template :
47+ spec :
48+ containers :
49+ - name : tensorflow
50+ image : kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/tensorflow-mnist-example:2.15.0-gpu
51+ imagePullPolicy : IfNotPresent
52+ command :
53+ - /bin/bash
54+ - -c
55+ args :
56+ - |
57+ echo "Worker: Start training..."
58+ sleep 30
59+ echo "Worker: Training completed."
You can’t perform that action at this time.
0 commit comments