Skip to content

Commit 3d3dab6

Browse files
authored
feat(example): update pytorch/tensorflow/mpi cron examples (#65)
Signed-off-by: Yi Chen <github@chenyicn.net>
1 parent fb16b93 commit 3d3dab6

File tree

6 files changed

+178
-111
lines changed

6 files changed

+178
-111
lines changed

config/samples/kustomization.yaml

Lines changed: 0 additions & 4 deletions
This file was deleted.

config/samples/v1alpha1_cron.yaml

Lines changed: 0 additions & 107 deletions
This file was deleted.

examples/kustomization.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
---
# Kustomize entry point for the example Cron manifests.
# Paths are relative to the directory that holds this file.
resources:
  - v1alpha1/cron/cron-pytorch.yaml
  - v1alpha1/cron/cron-tf.yaml
  - v1alpha1/cron/cron-mpi.yaml
examples/v1alpha1/cron/cron-mpi.yaml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
---
# Example Cron that creates an MPIJob from the template below on a
# cron schedule ("*/1 * * * *" = every minute).
apiVersion: apps.kubedl.io/v1alpha1
kind: Cron
metadata:
  name: cron-mpi
  labels:
    app.kubernetes.io/name: cron-operator
    app.kubernetes.io/managed-by: kustomize
spec:
  schedule: "*/1 * * * *"
  # NOTE(review): presumably mirrors Kubernetes CronJob semantics (no new
  # run while the previous one is still active) - confirm against the
  # Cron CRD.
  concurrencyPolicy: Forbid
  # NOTE(review): presumably the number of finished workloads kept in
  # history - confirm against the Cron CRD.
  historyLimit: 3
  template:
    workload:
      # The mpiReplicaSpecs (Launcher/Worker) layout used below is not part
      # of the v1alpha1 MPIJob schema, so the workload targets
      # kubeflow.org/v1, matching the PyTorchJob and TFJob examples.
      apiVersion: kubeflow.org/v1
      kind: MPIJob
      metadata:
        # Placeholder labels/annotations set on the workload metadata.
        labels:
          key1: value1
          key2: value2
        annotations:
          key1: value1
          key2: value2
      spec:
        mpiReplicaSpecs:
          Launcher:
            replicas: 1
            template:
              spec:
                containers:
                  - name: mpi
                    image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
                    imagePullPolicy: IfNotPresent
                    command:
                      - /bin/bash
                      - -c
                    # Placeholder workload: print, sleep 30 seconds, print.
                    args:
                      - |
                        echo "Launcher: Start training..."
                        sleep 30
                        echo "Launcher: Training completed."
          Worker:
            replicas: 2
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: mpi
                    image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
                    imagePullPolicy: IfNotPresent
                    command:
                      - /bin/bash
                      - -c
                    # Placeholder workload: print, sleep 30 seconds, print.
                    args:
                      - |
                        echo "Worker: Start training..."
                        sleep 30
                        echo "Worker: Training completed."
examples/v1alpha1/cron/cron-pytorch.yaml

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
---
# Example Cron that creates a PyTorchJob (one Master, two Workers) from
# the template below on a cron schedule ('*/1 * * * *' = every minute).
apiVersion: apps.kubedl.io/v1alpha1
kind: Cron
metadata:
  name: cron-pytorch
  labels:
    app.kubernetes.io/name: cron-operator
    app.kubernetes.io/managed-by: kustomize
spec:
  schedule: '*/1 * * * *'
  # NOTE(review): presumably mirrors Kubernetes CronJob semantics (no new
  # run while the previous one is still active) - confirm against the
  # Cron CRD.
  concurrencyPolicy: Forbid
  # NOTE(review): presumably the number of finished workloads kept in
  # history - confirm against the Cron CRD.
  historyLimit: 3
  template:
    workload:
      apiVersion: kubeflow.org/v1
      kind: PyTorchJob
      metadata:
        # Placeholder labels/annotations set on the workload metadata.
        labels:
          key1: value1
          key2: value2
        annotations:
          key1: value1
          key2: value2
      spec:
        pytorchReplicaSpecs:
          Master:
            replicas: 1
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: pytorch
                    image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
                    imagePullPolicy: IfNotPresent
                    command:
                      - /bin/bash
                      - -c
                    # Placeholder workload: print, sleep 30 seconds, print.
                    args:
                      - |
                        echo "Master: Start training..."
                        sleep 30
                        echo "Master: Training completed."
          Worker:
            replicas: 2
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: pytorch
                    image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/pytorch-mnist-example:2.5.1-cuda12.4-cudnn9-runtime
                    imagePullPolicy: IfNotPresent
                    command:
                      - /bin/bash
                      - -c
                    # Placeholder workload: print, sleep 30 seconds, print.
                    args:
                      - |
                        echo "Worker: Start training..."
                        sleep 30
                        echo "Worker: Training completed."
examples/v1alpha1/cron/cron-tf.yaml

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
---
# Example Cron that creates a TFJob (one PS, two Workers) from the
# template below on a cron schedule ('*/1 * * * *' = every minute).
apiVersion: apps.kubedl.io/v1alpha1
kind: Cron
metadata:
  name: cron-tf
  labels:
    app.kubernetes.io/name: cron-operator
    app.kubernetes.io/managed-by: kustomize
spec:
  schedule: '*/1 * * * *'
  # NOTE(review): presumably mirrors Kubernetes CronJob semantics (no new
  # run while the previous one is still active) - confirm against the
  # Cron CRD.
  concurrencyPolicy: Forbid
  # NOTE(review): presumably the number of finished workloads kept in
  # history - confirm against the Cron CRD.
  historyLimit: 3
  template:
    workload:
      apiVersion: kubeflow.org/v1
      kind: TFJob
      metadata:
        # Placeholder labels/annotations set on the workload metadata.
        labels:
          key1: value1
          key2: value2
        annotations:
          key1: value1
          key2: value2
      spec:
        # Plain scalar: YAML reads this as the string "None", not null.
        cleanPodPolicy: None
        tfReplicaSpecs:
          PS:
            replicas: 1
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: tensorflow
                    image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/tensorflow-mnist-example:2.15.0-gpu
                    imagePullPolicy: IfNotPresent
                    command:
                      - /bin/bash
                      - -c
                    # Placeholder workload: print, sleep 30 seconds, print.
                    args:
                      - |
                        echo "PS: Start training..."
                        sleep 30
                        echo "PS: Training completed."
          Worker:
            replicas: 2
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: tensorflow
                    image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/tensorflow-mnist-example:2.15.0-gpu
                    imagePullPolicy: IfNotPresent
                    command:
                      - /bin/bash
                      - -c
                    # Placeholder workload: print, sleep 30 seconds, print.
                    args:
                      - |
                        echo "Worker: Start training..."
                        sleep 30
                        echo "Worker: Training completed."

0 commit comments

Comments
 (0)