@@ -19,6 +19,7 @@ package kfto
 import (
 	"fmt"
 	"testing"
+	"time"
 
 	kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
 	. "github.com/onsi/gomega"
@@ -29,15 +30,39 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
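+// The tests below cover the {single,multi}-node x {single,multi}-GPU matrix for both the CUDA and ROCm training images.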
-func TestPyTorchJobWithCuda(t *testing.T) {
-	runKFTOPyTorchJob(t, GetCudaTrainingImage(), "nvidia.com/gpu", 1)
+func TestPyTorchJobSingleNodeSingleGpuWithCuda(t *testing.T) {
+	runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 1, 0)
 }
 
-func TestPyTorchJobWithROCm(t *testing.T) {
-	runKFTOPyTorchJob(t, GetROCmTrainingImage(), "amd.com/gpu", 1)
+func TestPyTorchJobSingleNodeMultiGpuWithCuda(t *testing.T) {
+	runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 2, 0)
 }
 
-func runKFTOPyTorchJob(t *testing.T, image string, gpuLabel string, numGpus int) {
+func TestPyTorchJobMultiNodeSingleGpuWithCuda(t *testing.T) {
+	runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 1, 1)
+}
+
+func TestPyTorchJobMultiNodeMultiGpuWithCuda(t *testing.T) {
+	runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 2, 1)
+}
+
+func TestPyTorchJobSingleNodeSingleGpuWithROCm(t *testing.T) {
+	runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 1, 0)
+}
+
+func TestPyTorchJobSingleNodeMultiGpuWithROCm(t *testing.T) {
+	runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 2, 0)
+}
+
+func TestPyTorchJobMultiNodeSingleGpuWithROCm(t *testing.T) {
+	runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 1, 1)
+}
+
+func TestPyTorchJobMultiNodeMultiGpuWithROCm(t *testing.T) {
+	runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 2, 1)
+}
+
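+// runKFTOPyTorchJob runs a PyTorchJob built from the given training image and GPU type,
+// requesting numGpus GPUs per replica and numberOfWorkerNodes workers in addition to the master.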
+func runKFTOPyTorchJob(t *testing.T, image string, gpu Gpu, numGpus, numberOfWorkerNodes int) {
 	test := With(t)
 
 	// Create a namespace
@@ -54,20 +79,40 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpuLabel string, numGpus int)
 	defer test.Client().Core().CoreV1().PersistentVolumeClaims(namespace).Delete(test.Ctx(), outputPvc.Name, metav1.DeleteOptions{})
 
 	// Create training PyTorch job
-	tuningJob := createKFTOPyTorchJob(test, namespace, *config, gpuLabel, numGpus, outputPvc.Name, image)
+	tuningJob := createKFTOPyTorchJob(test, namespace, *config, gpu, numGpus, numberOfWorkerNodes, outputPvc.Name, image)
 	defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
 
 	// Make sure the PyTorch job is running
 	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutDouble).
 		Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
+	// Verify GPU utilization
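+	// (utilization is queried from the OpenShift Prometheus stack, which is why this check runs only on OpenShift and, here, only for NVIDIA GPUs)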
+	if IsOpenShift(test) && gpu == NVIDIA {
+		trainingPods := GetPods(test, namespace, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
+		test.Expect(trainingPods).To(HaveLen(numberOfWorkerNodes + 1)) // +1 accounts for the master pod
+
+		for _, trainingPod := range trainingPods {
+			// Check that GPUs for training pods were utilized recently
+			test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 15*time.Minute).
+				Should(
+					And(
+						HaveLen(numGpus),
+						ContainElement(
+							// Check that at least one GPU was utilized at more than 50%
+							HaveField("Value", BeNumerically(">", 50)),
+						),
+					),
+				)
+		}
+		test.T().Log("All GPUs were successfully utilized")
+	}
+
 	// Make sure the PyTorch job succeeded
 	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
-
 }
 
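+// createKFTOPyTorchJob submits a PyTorchJob with one master and numberOfWorkerNodes workers,
+// each requesting numGpus GPUs of the given type.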
-func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpuLabel string, numGpus int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
+func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpu Gpu, numGpus, numberOfWorkerNodes int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
 	tuningJob := &kftov1.PyTorchJob{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: corev1.SchemeGroupVersion.String(),
@@ -78,14 +123,33 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 		},
 		Spec: kftov1.PyTorchJobSpec{
 			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
-				"Master": {
+				kftov1.PyTorchJobReplicaTypeMaster: {
 					Replicas:      Ptr(int32(1)),
 					RestartPolicy: "OnFailure",
 					Template: corev1.PodTemplateSpec{
+						ObjectMeta: metav1.ObjectMeta{
+							Labels: map[string]string{
+								"app": "kfto-llm",
+							},
+						},
 						Spec: corev1.PodSpec{
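+							// Required anti-affinity on the shared "app" label forces each pod of this job onto a distinct node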
+							Affinity: &corev1.Affinity{
+								PodAntiAffinity: &corev1.PodAntiAffinity{
+									RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
+										{
+											LabelSelector: &metav1.LabelSelector{
+												MatchLabels: map[string]string{
+													"app": "kfto-llm",
+												},
+											},
+											TopologyKey: "kubernetes.io/hostname",
+										},
+									},
+								},
+							},
 							Tolerations: []corev1.Toleration{
 								{
-									Key:      gpuLabel,
+									Key:      gpu.ResourceLabel,
 									Operator: corev1.TolerationOpExists,
 								},
 							},
@@ -124,12 +188,12 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 									ImagePullPolicy: corev1.PullIfNotPresent,
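+									// torchrun (rather than plain python) launches the script so it can join the distributed run spanning master and workers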
 									Command: []string{
 										"/bin/bash", "-c",
-										`python /etc/config/hf_llm_training.py \
+										`torchrun /etc/config/hf_llm_training.py \
 										--model_uri /tmp/model/bloom-560m \
 										--model_dir /tmp/model/bloom-560m \
-										--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
+										--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
 										--transformer_type AutoModelForCausalLM \
-										--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch"}' \
+										--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
 										--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
 									},
 									Env: []corev1.EnvVar{
@@ -145,6 +209,10 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 											Name:  "TOKENIZERS_PARALLELISM",
 											Value: "false",
 										},
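+										// Emit NCCL logs at INFO level to help debug inter-GPU communication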
+										{
+											Name:  "NCCL_DEBUG",
+											Value: "INFO",
+										},
 									},
 									VolumeMounts: []corev1.VolumeMount{
 										{
@@ -162,14 +230,14 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 									},
 									Resources: corev1.ResourceRequirements{
 										Requests: corev1.ResourceList{
-											corev1.ResourceCPU:            resource.MustParse("2"),
-											corev1.ResourceMemory:         resource.MustParse("8Gi"),
-											corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)),
+											corev1.ResourceCPU:                     resource.MustParse("2"),
+											corev1.ResourceMemory:                  resource.MustParse("8Gi"),
+											corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
 										},
 										Limits: corev1.ResourceList{
-											corev1.ResourceCPU:            resource.MustParse("2"),
-											corev1.ResourceMemory:         resource.MustParse("8Gi"),
-											corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)),
+											corev1.ResourceCPU:                     resource.MustParse("2"),
+											corev1.ResourceMemory:                  resource.MustParse("8Gi"),
+											corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
 										},
 									},
 									SecurityContext: &corev1.SecurityContext{
@@ -207,6 +275,146 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 						},
 					},
 				},
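+				// The worker spec mirrors the master: same labels, anti-affinity, toleration, and training container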
+				kftov1.PyTorchJobReplicaTypeWorker: {
+					Replicas:      Ptr(int32(numberOfWorkerNodes)),
+					RestartPolicy: "OnFailure",
+					Template: corev1.PodTemplateSpec{
+						ObjectMeta: metav1.ObjectMeta{
+							Labels: map[string]string{
+								"app": "kfto-llm",
+							},
+						},
+						Spec: corev1.PodSpec{
+							Affinity: &corev1.Affinity{
+								PodAntiAffinity: &corev1.PodAntiAffinity{
+									RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
+										{
+											LabelSelector: &metav1.LabelSelector{
+												MatchLabels: map[string]string{
+													"app": "kfto-llm",
+												},
+											},
+											TopologyKey: "kubernetes.io/hostname",
+										},
+									},
+								},
+							},
+							Tolerations: []corev1.Toleration{
+								{
+									Key:      gpu.ResourceLabel,
+									Operator: corev1.TolerationOpExists,
+								},
+							},
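+							// Init containers stage the model weights and the dataset into the shared /tmp emptyDir before training starts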
+							InitContainers: []corev1.Container{
+								{
+									Name:            "copy-model",
+									Image:           GetBloomModelImage(),
+									ImagePullPolicy: corev1.PullIfNotPresent,
+									VolumeMounts: []corev1.VolumeMount{
+										{
+											Name:      "tmp-volume",
+											MountPath: "/tmp",
+										},
+									},
+									Command: []string{"/bin/sh", "-c"},
+									Args:    []string{"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model"},
+								},
+								{
+									Name:            "copy-dataset",
+									Image:           GetAlpacaDatasetImage(),
+									ImagePullPolicy: corev1.PullIfNotPresent,
+									VolumeMounts: []corev1.VolumeMount{
+										{
+											Name:      "tmp-volume",
+											MountPath: "/tmp",
+										},
+									},
+									Command: []string{"/bin/sh", "-c"},
+									Args:    []string{"mkdir /tmp/all_datasets; cp -r /dataset/* /tmp/all_datasets; ls /tmp/all_datasets"},
+								},
+							},
+							Containers: []corev1.Container{
+								{
+									Name:            "pytorch",
+									Image:           baseImage,
+									ImagePullPolicy: corev1.PullIfNotPresent,
+									Command: []string{
+										"/bin/bash", "-c",
+										`torchrun /etc/config/hf_llm_training.py \
+										--model_uri /tmp/model/bloom-560m \
+										--model_dir /tmp/model/bloom-560m \
+										--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
+										--transformer_type AutoModelForCausalLM \
+										--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
+										--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
+									},
+									Env: []corev1.EnvVar{
+										{
+											Name:  "HF_HOME",
+											Value: "/tmp/.cache",
+										},
+										{
+											Name:  "TRITON_CACHE_DIR",
+											Value: "/tmp/.triton",
+										},
+										{
+											Name:  "TOKENIZERS_PARALLELISM",
+											Value: "false",
+										},
+										{
+											Name:  "NCCL_DEBUG",
+											Value: "INFO",
+										},
+									},
+									VolumeMounts: []corev1.VolumeMount{
+										{
+											Name:      "config-volume",
+											MountPath: "/etc/config",
+										},
+										{
+											Name:      "tmp-volume",
+											MountPath: "/tmp",
+										},
+									},
+									Resources: corev1.ResourceRequirements{
+										Requests: corev1.ResourceList{
+											corev1.ResourceCPU:                     resource.MustParse("2"),
+											corev1.ResourceMemory:                  resource.MustParse("8Gi"),
+											corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
+										},
+										Limits: corev1.ResourceList{
+											corev1.ResourceCPU:                     resource.MustParse("2"),
+											corev1.ResourceMemory:                  resource.MustParse("8Gi"),
+											corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
+										},
+									},
+									SecurityContext: &corev1.SecurityContext{
+										RunAsNonRoot:           Ptr(true),
+										ReadOnlyRootFilesystem: Ptr(true),
+									},
+								},
+							},
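+							// config-volume carries the training script from the ConfigMap; tmp-volume is the emptyDir scratch space shared with the init containers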
+							Volumes: []corev1.Volume{
+								{
+									Name: "config-volume",
+									VolumeSource: corev1.VolumeSource{
+										ConfigMap: &corev1.ConfigMapVolumeSource{
+											LocalObjectReference: corev1.LocalObjectReference{
+												Name: config.Name,
+											},
+										},
+									},
+								},
+								{
+									Name: "tmp-volume",
+									VolumeSource: corev1.VolumeSource{
+										EmptyDir: &corev1.EmptyDirVolumeSource{},
+									},
+								},
+							},
+						},
+					},
+				},
 			},
 		},
 	}