@@ -54,7 +54,7 @@ func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm(t *testing.T) {
54
54
runKFTOPyTorchMnistJob (t , AMD , GetROCmTrainingImage (), "resources/requirements-rocm.txt" , 1 , 2 )
55
55
}
56
56
57
- func runKFTOPyTorchMnistJob (t * testing.T , gpu Gpu , image string , requirementsFile string , workerReplicas , numProcPerNode int ) {
57
+ func runKFTOPyTorchMnistJob (t * testing.T , accelerator Accelerator , image string , requirementsFile string , workerReplicas , numProcPerNode int ) {
58
58
test := With (t )
59
59
60
60
// Create a namespace
@@ -63,7 +63,7 @@ func runKFTOPyTorchMnistJob(t *testing.T, gpu Gpu, image string, requirementsFil
63
63
mnist := ReadFile (test , "resources/mnist.py" )
64
64
requirementsFileName := ReadFile (test , requirementsFile )
65
65
66
- if workerReplicas * numProcPerNode > 0 && gpu . ResourceLabel != "cpu" {
66
+ if accelerator . isGpu () {
67
67
mnist = bytes .Replace (mnist , []byte ("accelerator=\" has to be specified\" " ), []byte ("accelerator=\" gpu\" " ), 1 )
68
68
} else {
69
69
mnist = bytes .Replace (mnist , []byte ("accelerator=\" has to be specified\" " ), []byte ("accelerator=\" cpu\" " ), 1 )
@@ -74,31 +74,28 @@ func runKFTOPyTorchMnistJob(t *testing.T, gpu Gpu, image string, requirementsFil
74
74
"requirements.txt" : requirementsFileName ,
75
75
})
76
76
77
- outputPvc := CreatePersistentVolumeClaim (test , namespace .Name , "50Gi" , corev1 .ReadWriteOnce )
78
- defer test .Client ().Core ().CoreV1 ().PersistentVolumeClaims (namespace .Name ).Delete (test .Ctx (), outputPvc .Name , metav1.DeleteOptions {})
79
-
80
77
// Create training PyTorch job
81
- tuningJob := createKFTOPyTorchMnistJob (test , namespace .Name , * config , gpu , workerReplicas , numProcPerNode , outputPvc . Name , image )
78
+ tuningJob := createKFTOPyTorchMnistJob (test , namespace .Name , * config , accelerator , workerReplicas , numProcPerNode , image )
82
79
defer test .Client ().Kubeflow ().KubeflowV1 ().PyTorchJobs (namespace .Name ).Delete (test .Ctx (), tuningJob .Name , * metav1 .NewDeleteOptions (0 ))
83
80
84
81
// Make sure the PyTorch job is running
85
82
test .Eventually (PyTorchJob (test , namespace .Name , tuningJob .Name ), TestTimeoutDouble ).
86
83
Should (WithTransform (PyTorchJobConditionRunning , Equal (corev1 .ConditionTrue )))
87
84
88
85
// Verify GPU utilization
89
- if IsOpenShift (test ) && gpu == NVIDIA {
86
+ if IsOpenShift (test ) && accelerator == NVIDIA {
90
87
trainingPods := GetPods (test , namespace .Name , metav1.ListOptions {LabelSelector : "training.kubeflow.org/job-name=" + tuningJob .GetName ()})
91
88
test .Expect (trainingPods ).To (HaveLen (workerReplicas + 1 )) // +1 is a master node
92
89
93
90
for _ , trainingPod := range trainingPods {
94
91
// Check that GPUs for training pods were utilized recently
95
- test .Eventually (OpenShiftPrometheusGpuUtil (test , trainingPod , gpu ), 15 * time .Minute ).
92
+ test .Eventually (OpenShiftPrometheusGpuUtil (test , trainingPod , accelerator ), 15 * time .Minute ).
96
93
Should (
97
94
And (
98
95
HaveLen (numProcPerNode ),
99
96
ContainElement (
100
- // Check that at least some GPU was utilized on more than 30 %
101
- HaveField ("Value" , BeNumerically (">" , 30 )),
97
+ // Check that at least some GPU was utilized on more than 20 %
98
+ HaveField ("Value" , BeNumerically (">" , 20 )),
102
99
),
103
100
),
104
101
)
@@ -112,12 +109,9 @@ func runKFTOPyTorchMnistJob(t *testing.T, gpu Gpu, image string, requirementsFil
112
109
113
110
}
114
111
115
- func createKFTOPyTorchMnistJob (test Test , namespace string , config corev1.ConfigMap , gpu Gpu , workerReplicas int , numProcPerNode int , outputPvcName string , baseImage string ) * kftov1.PyTorchJob {
116
- var useGPU = false
112
+ func createKFTOPyTorchMnistJob (test Test , namespace string , config corev1.ConfigMap , accelerator Accelerator , workerReplicas int , numProcPerNode int , baseImage string ) * kftov1.PyTorchJob {
117
113
var backend string
118
-
119
- if gpu .ResourceLabel != "cpu" {
120
- useGPU = true
114
+ if accelerator .isGpu () {
121
115
backend = "nccl"
122
116
} else {
123
117
backend = "gloo"
@@ -172,7 +166,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
172
166
MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
173
167
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
174
168
echo -e "\n\n Starting training..." && \
175
- torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 64 --lr 0.0005 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
169
+ torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
176
170
},
177
171
VolumeMounts : []corev1.VolumeMount {
178
172
{
@@ -257,7 +251,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
257
251
MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
258
252
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
259
253
echo -e "\n\n Starting training..." && \
260
- torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 64 --lr 0.0005 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
254
+ torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
261
255
},
262
256
VolumeMounts : []corev1.VolumeMount {
263
257
{
@@ -307,12 +301,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
307
301
},
308
302
}
309
303
310
- if useGPU {
304
+ if accelerator . isGpu () {
311
305
// Update resource lists for GPU (NVIDIA/ROCm) usecase
312
- tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Resources .Requests [corev1 .ResourceName (gpu .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
313
- tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (gpu .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
314
- tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Resources .Requests [corev1 .ResourceName (gpu .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
315
- tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (gpu .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
306
+ tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Resources .Requests [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
307
+ tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
308
+ tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Resources .Requests [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
309
+ tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Containers [0 ].Resources .Limits [corev1 .ResourceName (accelerator .ResourceLabel )] = resource .MustParse (fmt .Sprint (numProcPerNode ))
316
310
317
311
tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Containers [0 ].Env = []corev1.EnvVar {
318
312
{
@@ -338,13 +332,13 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
338
332
// Update tolerations
339
333
tuningJob .Spec .PyTorchReplicaSpecs ["Master" ].Template .Spec .Tolerations = []corev1.Toleration {
340
334
{
341
- Key : gpu .ResourceLabel ,
335
+ Key : accelerator .ResourceLabel ,
342
336
Operator : corev1 .TolerationOpExists ,
343
337
},
344
338
}
345
339
tuningJob .Spec .PyTorchReplicaSpecs ["Worker" ].Template .Spec .Tolerations = []corev1.Toleration {
346
340
{
347
- Key : gpu .ResourceLabel ,
341
+ Key : accelerator .ResourceLabel ,
348
342
Operator : corev1 .TolerationOpExists ,
349
343
},
350
344
}
0 commit comments