Skip to content

Commit 09204ae

Browse files
Add Kubeflow Trainer V2 demo
1 parent 0235a63 commit 09204ae

File tree

9 files changed

+2699
-0
lines changed

9 files changed

+2699
-0
lines changed
Binary file not shown.

examples/kft-v2/docs/jobs.png

182 KB
Loading
91 KB
Loading
106 KB
Loading
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
---
# ClusterTrainingRuntime "pytorch-cpu" for the Kubeflow Trainer V2 demo.
#
# Defines three replicated jobs chained with dependsOn:
#   1. dataset-initializer
#   2. model-initializer (starts once dataset-initializer is Complete)
#   3. node              (starts once model-initializer is Complete)
#
# All three jobs mount the same PersistentVolumeClaim
# (shared-checkpoint-storage) at /workspace, so whatever the initializers
# write under /workspace is visible to the trainer.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  name: pytorch-cpu
spec:
  mlPolicy:
    numNodes: 2
    torch:
      numProcPerNode: 1
  template:
    metadata: {}
    spec:
      replicatedJobs:
        # Step 1: dataset initializer, configured through DATASET_* env vars.
        - name: dataset-initializer
          replicas: 1
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: dataset-initializer
            spec:
              template:
                spec:
                  containers:
                    - env:
                        # Cache directory lives on the shared volume.
                        - name: HF_HOME
                          value: /workspace/cache
                        - name: DATASET_NAME
                          value: tatsu-lab/alpaca
                        - name: DATASET_CONFIG
                          value: main
                        # Quoted: a plain scalar containing '[' / ':' is
                        # fragile in YAML.
                        - name: DATASET_SPLIT
                          value: 'train[:500]'
                        - name: DATASET_FORMAT
                          value: json
                      image: 'ghcr.io/kubeflow/trainer/dataset-initializer:v2.0.0'
                      name: dataset-initializer
                      resources:
                        limits:
                          cpu: '2'
                          memory: 4Gi
                        requests:
                          cpu: '1'
                          memory: 2Gi
                      volumeMounts:
                        - mountPath: /workspace
                          name: shared-workspace
                  restartPolicy: Never
                  volumes:
                    - name: shared-workspace
                      persistentVolumeClaim:
                        claimName: shared-checkpoint-storage
        # Step 2: model initializer; waits for the dataset step to complete.
        - dependsOn:
            - name: dataset-initializer
              status: Complete
          name: model-initializer
          replicas: 1
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: model-initializer
            spec:
              template:
                spec:
                  containers:
                    - env:
                        - name: HF_HOME
                          value: /workspace/cache
                        - name: MODEL_NAME
                          value: gpt2
                        - name: MODEL_REVISION
                          value: main
                        - name: DOWNLOAD_MODE
                          value: force_redownload
                      image: 'ghcr.io/kubeflow/trainer/model-initializer:v2.0.0'
                      name: model-initializer
                      resources:
                        limits:
                          cpu: '2'
                          memory: 4Gi
                        requests:
                          cpu: '1'
                          memory: 2Gi
                      volumeMounts:
                        - mountPath: /workspace
                          name: shared-workspace
                  restartPolicy: Never
                  volumes:
                    - name: shared-workspace
                      persistentVolumeClaim:
                        claimName: shared-checkpoint-storage
        # Step 3: trainer; waits for the model step to complete.
        - dependsOn:
            - name: model-initializer
              status: Complete
          name: node
          replicas: 1
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: trainer
            spec:
              template:
                metadata: {}
                spec:
                  containers:
                    - env:
                        # Quoted so the value stays a string, as env values
                        # must be.
                        - name: PYTHONUNBUFFERED
                          value: '1'
                        - name: NCCL_DEBUG
                          value: INFO
                        - name: NCCL_SOCKET_IFNAME
                          value: eth0
                        - name: NCCL_IB_DISABLE
                          value: '1'
                        - name: NCCL_P2P_DISABLE
                          value: '1'
                        - name: TRAINJOB_PROGRESSION_FILE_PATH
                          value: /tmp/training_progression.json
                        # Checkpoints are written under the shared volume.
                        - name: CHECKPOINT_DIR
                          value: /workspace/checkpoints
                      # NOTE(review): this runtime is named "pytorch-cpu" but
                      # uses a CUDA image tag and sets NCCL_* variables;
                      # confirm this is intentional.
                      image: 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
                      name: node
                      resources:
                        limits:
                          cpu: '2'
                          memory: 4Gi
                        requests:
                          cpu: '1'
                          memory: 2Gi
                      volumeMounts:
                        - mountPath: /workspace
                          name: shared-workspace
                  volumes:
                    - name: shared-workspace
                      persistentVolumeClaim:
                        # Must be the same claim the initializer jobs mount;
                        # previously "fashion-mnist-storage", which left the
                        # trainer with a different /workspace than the one the
                        # initializers populated.
                        claimName: shared-checkpoint-storage
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
---
# PersistentVolumeClaim "shared-checkpoint-storage": a 50Gi filesystem volume
# requested from the "nfs-csi" storage class with ReadWriteMany access.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: shared-checkpoint-storage
spec:
  storageClassName: nfs-csi
  volumeMode: Filesystem
  # ReadWriteMany lets the claim be mounted read-write by more than one node.
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 50Gi

0 commit comments

Comments
 (0)