@@ -20,6 +20,7 @@ import (
20
20
"bytes"
21
21
"fmt"
22
22
"testing"
23
+ "time"
23
24
24
25
kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
25
26
. "github.com/onsi/gomega"
@@ -30,27 +31,30 @@ import (
30
31
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31
32
)
32
33
33
- func TestPyTorchJobMnistMultiNodeCpu (t * testing.T ) {
34
- runKFTOPyTorchMnistJob (t , 0 , 2 , "" , GetCudaTrainingImage (), "resources/requirements.txt" )
34
// TestPyTorchJobMnistMultiNodeSingleCpu runs the MNIST PyTorchJob on the CPU
// accelerator with 2 worker replicas and 1 process per node. The last two
// arguments map to workerReplicas and numProcPerNode of runKFTOPyTorchMnistJob.
// The CUDA training image and resources/requirements.txt are used for this run.
+ func TestPyTorchJobMnistMultiNodeSingleCpu (t * testing.T ) {
35
+ runKFTOPyTorchMnistJob (t , CPU , GetCudaTrainingImage (), "resources/requirements.txt" , 2 , 1 )
35
36
}
36
-
37
- func TestPyTorchJobMnistMultiNodeWithCuda (t * testing.T ) {
38
- runKFTOPyTorchMnistJob (t , 1 , 1 , "nvidia.com/gpu" , GetCudaTrainingImage (), "resources/requirements.txt" )
37
// TestPyTorchJobMnistMultiNodeMultiCpu runs the MNIST PyTorchJob on the CPU
// accelerator with 2 worker replicas and 2 processes per node. The last two
// arguments map to workerReplicas and numProcPerNode of runKFTOPyTorchMnistJob.
// The CUDA training image and resources/requirements.txt are used for this run.
+ func TestPyTorchJobMnistMultiNodeMultiCpu (t * testing.T ) {
38
+ runKFTOPyTorchMnistJob (t , CPU , GetCudaTrainingImage (), "resources/requirements.txt" , 2 , 2 )
39
39
}
40
40
41
- func TestPyTorchJobMnistMultiNodeWithROCm (t * testing.T ) {
42
- runKFTOPyTorchMnistJob (t , 1 , 1 , "amd.com/gpu" , GetROCmTrainingImage (), "resources/requirements-rocm .txt" )
41
// TestPyTorchJobMnistMultiNodeSingleGpuWithCuda runs the MNIST PyTorchJob with
// the NVIDIA accelerator, 1 worker replica and 1 process per node, using the
// CUDA training image and resources/requirements.txt.
+ func TestPyTorchJobMnistMultiNodeSingleGpuWithCuda (t * testing.T ) {
42
+ runKFTOPyTorchMnistJob (t , NVIDIA , GetCudaTrainingImage (), "resources/requirements.txt" , 1 , 1 )
43
43
}
44
44
45
45
// TestPyTorchJobMnistMultiNodeMultiGpuWithCuda runs the MNIST PyTorchJob with
// the NVIDIA accelerator, using the CUDA training image and
// resources/requirements.txt. The added call passes 1 worker replica and
// 2 processes per node; the removed call passed numGpus=2, workerReplicas=1
// and the "nvidia.com/gpu" label under the previous helper signature.
func TestPyTorchJobMnistMultiNodeMultiGpuWithCuda (t * testing.T ) {
46
- runKFTOPyTorchMnistJob (t , 2 , 1 , "nvidia.com/gpu" , GetCudaTrainingImage (), "resources/requirements.txt" )
46
+ runKFTOPyTorchMnistJob (t , NVIDIA , GetCudaTrainingImage (), "resources/requirements.txt" , 1 , 2 )
47
+ }
48
+
49
// TestPyTorchJobMnistMultiNodeSingleGpuWithROCm runs the MNIST PyTorchJob with
// the AMD accelerator, 1 worker replica and 1 process per node, using the
// ROCm training image and resources/requirements-rocm.txt.
+ func TestPyTorchJobMnistMultiNodeSingleGpuWithROCm (t * testing.T ) {
50
+ runKFTOPyTorchMnistJob (t , AMD , GetROCmTrainingImage (), "resources/requirements-rocm.txt" , 1 , 1 )
47
51
}
48
52
49
53
// TestPyTorchJobMnistMultiNodeMultiGpuWithROCm runs the MNIST PyTorchJob with
// the AMD accelerator, using the ROCm training image and
// resources/requirements-rocm.txt. The added call passes 1 worker replica and
// 2 processes per node; the removed call passed numGpus=2, workerReplicas=1
// and the "amd.com/gpu" label under the previous helper signature.
func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm (t * testing.T ) {
50
- runKFTOPyTorchMnistJob (t , 2 , 1 , "amd.com/gpu" , GetROCmTrainingImage (), "resources/requirements-rocm.txt" )
54
+ runKFTOPyTorchMnistJob (t , AMD , GetROCmTrainingImage (), "resources/requirements-rocm.txt" , 1 , 2 )
51
55
}
52
56
53
- func runKFTOPyTorchMnistJob (t * testing.T , numGpus int , workerReplicas int , gpuLabel string , image string , requirementsFile string ) {
57
+ func runKFTOPyTorchMnistJob (t * testing.T , accelerator Accelerator , image string , requirementsFile string , workerReplicas , numProcPerNode int ) {
54
58
test := With (t )
55
59
56
60
// Create a namespace
@@ -59,7 +63,7 @@ func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLa
59
63
mnist := ReadFile (test , "resources/mnist.py" )
60
64
requirementsFileName := ReadFile (test , requirementsFile )
61
65
62
- if numGpus > 0 {
66
+ if accelerator . isGpu () {
63
67
mnist = bytes .Replace (mnist , []byte ("accelerator=\" has to be specified\" " ), []byte ("accelerator=\" gpu\" " ), 1 )
64
68
} else {
65
69
mnist = bytes .Replace (mnist , []byte ("accelerator=\" has to be specified\" " ), []byte ("accelerator=\" cpu\" " ), 1 )
@@ -70,29 +74,44 @@ func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLa
70
74
"requirements.txt" : requirementsFileName ,
71
75
})
72
76
73
- outputPvc := CreatePersistentVolumeClaim (test , namespace .Name , "50Gi" , corev1 .ReadWriteOnce )
74
- defer test .Client ().Core ().CoreV1 ().PersistentVolumeClaims (namespace .Name ).Delete (test .Ctx (), outputPvc .Name , metav1.DeleteOptions {})
75
-
76
77
// Create training PyTorch job
77
- tuningJob := createKFTOPyTorchMnistJob (test , namespace .Name , * config , gpuLabel , numGpus , workerReplicas , outputPvc . Name , image )
78
+ tuningJob := createKFTOPyTorchMnistJob (test , namespace .Name , * config , accelerator , workerReplicas , numProcPerNode , image )
78
79
defer test .Client ().Kubeflow ().KubeflowV1 ().PyTorchJobs (namespace .Name ).Delete (test .Ctx (), tuningJob .Name , * metav1 .NewDeleteOptions (0 ))
79
80
80
81
// Make sure the PyTorch job is running
81
82
test .Eventually (PyTorchJob (test , namespace .Name , tuningJob .Name ), TestTimeoutDouble ).
82
83
Should (WithTransform (PyTorchJobConditionRunning , Equal (corev1 .ConditionTrue )))
83
84
85
+ // Verify GPU utilization
86
+ if IsOpenShift (test ) && accelerator == NVIDIA {
87
+ trainingPods := GetPods (test , namespace .Name , metav1.ListOptions {LabelSelector : "training.kubeflow.org/job-name=" + tuningJob .GetName ()})
88
+ test .Expect (trainingPods ).To (HaveLen (workerReplicas + 1 )) // +1 is a master node
89
+
90
+ for _ , trainingPod := range trainingPods {
91
+ // Check that GPUs for training pods were utilized recently
92
+ test .Eventually (OpenShiftPrometheusGpuUtil (test , trainingPod , accelerator ), 15 * time .Minute ).
93
+ Should (
94
+ And (
95
+ HaveLen (numProcPerNode ),
96
+ ContainElement (
97
+ // Check that at least one GPU was utilized above 20%
98
+ HaveField ("Value" , BeNumerically (">" , 20 )),
99
+ ),
100
+ ),
101
+ )
102
+ }
103
+ test .T ().Log ("All GPUs were successfully utilized" )
104
+ }
105
+
84
106
// Make sure the PyTorch job succeeded
85
107
test .Eventually (PyTorchJob (test , namespace .Name , tuningJob .Name ), TestTimeoutDouble ).Should (WithTransform (PyTorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
86
108
test .T ().Logf ("PytorchJob %s/%s ran successfully" , tuningJob .Namespace , tuningJob .Name )
87
109
88
110
}
89
111
90
- func createKFTOPyTorchMnistJob (test Test , namespace string , config corev1.ConfigMap , gpuLabel string , numGpus int , workerReplicas int , outputPvcName string , baseImage string ) * kftov1.PyTorchJob {
91
- var useGPU = false
112
+ func createKFTOPyTorchMnistJob (test Test , namespace string , config corev1.ConfigMap , accelerator Accelerator , workerReplicas int , numProcPerNode int , baseImage string ) * kftov1.PyTorchJob {
92
113
var backend string
93
-
94
- if numGpus > 0 {
95
- useGPU = true
114
+ if accelerator .isGpu () {
96
115
backend = "nccl"
97
116
} else {
98
117
backend = "gloo"
@@ -108,13 +127,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
108
127
},
109
128
Spec : kftov1.PyTorchJobSpec {
110
129
PyTorchReplicaSpecs : map [kftov1.ReplicaType ]* kftov1.ReplicaSpec {
111
- "Master" : {
130
+ kftov1 . PyTorchJobReplicaTypeMaster : {
112
131
Replicas : Ptr (int32 (1 )),
113
132
RestartPolicy : kftov1 .RestartPolicyOnFailure ,
114
133
Template : corev1.PodTemplateSpec {
115
134
ObjectMeta : metav1.ObjectMeta {
116
135
Labels : map [string ]string {
117
- "app" : "kfto-mnist" ,
136
+ "app" : "kfto-mnist" ,
137
+ "role" : "master" ,
118
138
},
119
139
},
120
140
Spec : corev1.PodSpec {
@@ -139,9 +159,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
139
159
ImagePullPolicy : corev1 .PullIfNotPresent ,
140
160
Command : []string {
141
161
"/bin/bash" , "-c" ,
142
- fmt .Sprintf (`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
162
+ fmt .Sprintf (`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
143
163
pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
144
- python /mnt/files/mnist.py --epochs 3 --save-model --output-path /mnt/output --backend %s` , backend ),
164
+ echo "Downloading MNIST dataset..." && \
165
+ python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
166
+ MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
167
+ echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
168
+ echo -e "\n\n Starting training..." && \
169
+ torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
145
170
},
146
171
VolumeMounts : []corev1.VolumeMount {
147
172
{
@@ -152,14 +177,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
152
177
Name : "tmp-volume" ,
153
178
MountPath : "/tmp" ,
154
179
},
155
- {
156
- Name : "output-volume" ,
157
- MountPath : "/mnt/output" ,
158
- },
159
180
},
160
181
Resources : corev1.ResourceRequirements {
182
+ Requests : corev1.ResourceList {
183
+ corev1 .ResourceCPU : resource .MustParse (fmt .Sprintf ("%d" , numProcPerNode )),
184
+ corev1 .ResourceMemory : resource .MustParse ("6Gi" ),
185
+ },
161
186
Limits : corev1.ResourceList {
162
- corev1 .ResourceCPU : resource .MustParse ("1" ),
187
+ corev1 .ResourceCPU : resource .MustParse (fmt . Sprintf ( "%d" , numProcPerNode ) ),
163
188
corev1 .ResourceMemory : resource .MustParse ("6Gi" ),
164
189
},
165
190
},
@@ -182,26 +207,19 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
182
207
EmptyDir : & corev1.EmptyDirVolumeSource {},
183
208
},
184
209
},
185
- {
186
- Name : "output-volume" ,
187
- VolumeSource : corev1.VolumeSource {
188
- PersistentVolumeClaim : & corev1.PersistentVolumeClaimVolumeSource {
189
- ClaimName : outputPvcName ,
190
- },
191
- },
192
- },
193
210
},
194
211
RestartPolicy : corev1 .RestartPolicyOnFailure ,
195
212
},
196
213
},
197
214
},
198
- "Worker" : {
215
+ kftov1 . PyTorchJobReplicaTypeWorker : {
199
216
Replicas : Ptr (int32 (workerReplicas )),
200
217
RestartPolicy : kftov1 .RestartPolicyOnFailure ,
201
218
Template : corev1.PodTemplateSpec {
202
219
ObjectMeta : metav1.ObjectMeta {
203
220
Labels : map [string ]string {
204
- "app" : "kfto-mnist" ,
221
+ "app" : "kfto-mnist" ,
222
+ "role" : "worker" ,
205
223
},
206
224
},
207
225
Spec : corev1.PodSpec {
@@ -226,9 +244,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
226
244
ImagePullPolicy : corev1 .PullIfNotPresent ,
227
245
Command : []string {
228
246
"/bin/bash" , "-c" ,
229
- fmt .Sprintf (`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
247
+ fmt .Sprintf (`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
230
248
pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
231
- python /mnt/files/mnist.py --epochs 3 --save-model --backend %s` , backend ),
249
+ echo "Downloading MNIST dataset..." && \
250
+ python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
251
+ MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
252
+ echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
253
+ echo -e "\n\n Starting training..." && \
254
+ torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
232
255
},
233
256
VolumeMounts : []corev1.VolumeMount {
234
257
{
@@ -241,8 +264,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
241
264
},
242
265
},
243
266
Resources : corev1.ResourceRequirements {
267
+ Requests : corev1.ResourceList {
268
+ corev1 .ResourceCPU : resource .MustParse (fmt .Sprintf ("%d" , numProcPerNode )),
269
+ corev1 .ResourceMemory : resource .MustParse ("6Gi" ),
270
+ },
244
271
Limits : corev1.ResourceList {
245
- corev1 .ResourceCPU : resource .MustParse ("1" ),
272
+ corev1 .ResourceCPU : resource .MustParse (fmt . Sprintf ( "%d" , numProcPerNode ) ),
246
273
corev1 .ResourceMemory : resource .MustParse ("6Gi" ),
247
274
},
248
275
},
@@ -274,34 +301,44 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
274
301
},
275
302
}
276
303
277
- if useGPU {
304
+ if accelerator . isGpu () {
278
305
// Update resource lists for GPU (NVIDIA/ROCm) usecase
279
- tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (gpuLabel )] = resource .MustParse (fmt .Sprint (numGpus ))
280
- tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (gpuLabel )] = resource .MustParse (fmt .Sprint (numGpus ))
306
+ tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Resources .Requests [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
307
+ tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
308
+ tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Resources .Requests [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
309
+ tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
281
310
282
311
tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Env = []corev1.EnvVar {
283
312
{
284
313
Name : "NCCL_DEBUG" ,
285
314
Value : "INFO" ,
286
315
},
316
+ {
317
+ Name : "TORCH_DISTRIBUTED_DEBUG" ,
318
+ Value : "DETAIL" ,
319
+ },
287
320
}
288
321
tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Env = []corev1.EnvVar {
289
322
{
290
323
Name : "NCCL_DEBUG" ,
291
324
Value : "INFO" ,
292
325
},
326
+ {
327
+ Name : "TORCH_DISTRIBUTED_DEBUG" ,
328
+ Value : "DETAIL" ,
329
+ },
293
330
}
294
331
295
332
// Update tolerations
296
333
tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Tolerations = []corev1.Toleration {
297
334
{
298
- Key : gpuLabel ,
335
+ Key : accelerator . ResourceLabel ,
299
336
Operator : corev1 .TolerationOpExists ,
300
337
},
301
338
}
302
339
tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Tolerations = []corev1.Toleration {
303
340
{
304
- Key : gpuLabel ,
341
+ Key : accelerator . ResourceLabel ,
305
342
Operator : corev1 .TolerationOpExists ,
306
343
},
307
344
}
0 commit comments