@@ -20,6 +20,7 @@ import (
 	"bytes"
 	"fmt"
 	"testing"
+	"time"
 
 	kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
 	. "github.com/onsi/gomega"
@@ -30,27 +31,30 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-func TestPyTorchJobMnistMultiNodeCpu(t *testing.T) {
-	runKFTOPyTorchMnistJob(t, 0, 2, "", GetCudaTrainingImage(), "resources/requirements.txt")
+func TestPyTorchJobMnistMultiNodeSingleCpu(t *testing.T) {
+	runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 1)
 }
-
-func TestPyTorchJobMnistMultiNodeWithCuda(t *testing.T) {
-	runKFTOPyTorchMnistJob(t, 1, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt")
+func TestPyTorchJobMnistMultiNodeMultiCpu(t *testing.T) {
+	runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 2)
 }
 
-func TestPyTorchJobMnistMultiNodeWithROCm(t *testing.T) {
-	runKFTOPyTorchMnistJob(t, 1, 1, "amd.com/gpu", GetROCmTrainingImage(), "resources/requirements-rocm.txt")
+func TestPyTorchJobMnistMultiNodeSingleGpuWithCuda(t *testing.T) {
+	runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 1)
 }
 
 func TestPyTorchJobMnistMultiNodeMultiGpuWithCuda(t *testing.T) {
-	runKFTOPyTorchMnistJob(t, 2, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt")
+	runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 2)
+}
+
+func TestPyTorchJobMnistMultiNodeSingleGpuWithROCm(t *testing.T) {
+	runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 1)
 }
 
 func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm(t *testing.T) {
-	runKFTOPyTorchMnistJob(t, 2, 1, "amd.com/gpu", GetROCmTrainingImage(), "resources/requirements-rocm.txt")
+	runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 2)
 }
 
-func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLabel string, image string, requirementsFile string) {
+func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string, requirementsFile string, workerReplicas, numProcPerNode int) {
 	test := With(t)
 
 	// Create a namespace
@@ -59,7 +63,7 @@ func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLa
 	mnist := ReadFile(test, "resources/mnist.py")
 	requirementsFileName := ReadFile(test, requirementsFile)
 
-	if numGpus > 0 {
+	if accelerator.isGpu() {
 		mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
 	} else {
 		mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"cpu\""), 1)
@@ -70,29 +74,44 @@ func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLa
 		"requirements.txt": requirementsFileName,
 	})
 
-	outputPvc := CreatePersistentVolumeClaim(test, namespace.Name, "50Gi", corev1.ReadWriteOnce)
-	defer test.Client().Core().CoreV1().PersistentVolumeClaims(namespace.Name).Delete(test.Ctx(), outputPvc.Name, metav1.DeleteOptions{})
-
 	// Create training PyTorch job
-	tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, gpuLabel, numGpus, workerReplicas, outputPvc.Name, image)
+	tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, accelerator, workerReplicas, numProcPerNode, image)
 	defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
 
 	// Make sure the PyTorch job is running
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
 		Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
+	// Verify GPU utilization
+	if IsOpenShift(test) && accelerator == NVIDIA {
+		trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
+		test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is the master node
+
+		for _, trainingPod := range trainingPods {
+			// Check that GPUs for training pods were utilized recently
+			test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
+				Should(
+					And(
+						HaveLen(numProcPerNode),
+						ContainElement(
+							// Check that at least one GPU was utilized at more than 20%
+							HaveField("Value", BeNumerically(">", 20)),
+						),
+					),
+				)
+		}
+		test.T().Log("All GPUs were successfully utilized")
+	}
+
 	// Make sure the PyTorch job succeeded
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
 
 }
 
-func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, gpuLabel string, numGpus int, workerReplicas int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
-	var useGPU = false
+func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, accelerator Accelerator, workerReplicas int, numProcPerNode int, baseImage string) *kftov1.PyTorchJob {
 	var backend string
-
-	if numGpus > 0 {
-		useGPU = true
+	if accelerator.isGpu() {
 		backend = "nccl"
 	} else {
 		backend = "gloo"
@@ -108,13 +127,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 		},
 		Spec: kftov1.PyTorchJobSpec{
 			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
-				"Master": {
+				kftov1.PyTorchJobReplicaTypeMaster: {
 					Replicas:      Ptr(int32(1)),
 					RestartPolicy: kftov1.RestartPolicyOnFailure,
 					Template: corev1.PodTemplateSpec{
 						ObjectMeta: metav1.ObjectMeta{
 							Labels: map[string]string{
-								"app": "kfto-mnist",
+								"app":  "kfto-mnist",
+								"role": "master",
 							},
 						},
 						Spec: corev1.PodSpec{
@@ -139,9 +159,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 									ImagePullPolicy: corev1.PullIfNotPresent,
 									Command: []string{
 										"/bin/bash", "-c",
-										fmt.Sprintf(`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
+										fmt.Sprintf(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
 										pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
-										python /mnt/files/mnist.py --epochs 3 --save-model --output-path /mnt/output --backend %s`, backend),
+										echo "Downloading MNIST dataset..." && \
+										python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
+										MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
+										echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
+										echo -e "\n\n Starting training..." && \
+										torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
 									},
 									VolumeMounts: []corev1.VolumeMount{
 										{
@@ -152,14 +177,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 											Name:      "tmp-volume",
 											MountPath: "/tmp",
 										},
-										{
-											Name:      "output-volume",
-											MountPath: "/mnt/output",
-										},
 									},
 									Resources: corev1.ResourceRequirements{
+										Requests: corev1.ResourceList{
+											corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
+											corev1.ResourceMemory: resource.MustParse("6Gi"),
+										},
 										Limits: corev1.ResourceList{
-											corev1.ResourceCPU:    resource.MustParse("1"),
+											corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
 											corev1.ResourceMemory: resource.MustParse("6Gi"),
 										},
 									},
@@ -182,26 +207,19 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 										EmptyDir: &corev1.EmptyDirVolumeSource{},
 									},
 								},
-								{
-									Name: "output-volume",
-									VolumeSource: corev1.VolumeSource{
-										PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
-											ClaimName: outputPvcName,
-										},
-									},
-								},
 							},
 							RestartPolicy: corev1.RestartPolicyOnFailure,
 						},
 					},
 				},
-				"Worker": {
+				kftov1.PyTorchJobReplicaTypeWorker: {
 					Replicas:      Ptr(int32(workerReplicas)),
 					RestartPolicy: kftov1.RestartPolicyOnFailure,
 					Template: corev1.PodTemplateSpec{
 						ObjectMeta: metav1.ObjectMeta{
 							Labels: map[string]string{
-								"app": "kfto-mnist",
+								"app":  "kfto-mnist",
+								"role": "worker",
 							},
 						},
 						Spec: corev1.PodSpec{
@@ -226,9 +244,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 									ImagePullPolicy: corev1.PullIfNotPresent,
 									Command: []string{
 										"/bin/bash", "-c",
-										fmt.Sprintf(`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
+										fmt.Sprintf(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
 										pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
-										python /mnt/files/mnist.py --epochs 3 --save-model --backend %s`, backend),
+										echo "Downloading MNIST dataset..." && \
+										python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
+										MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
+										echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
+										echo -e "\n\n Starting training..." && \
+										torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
 									},
 									VolumeMounts: []corev1.VolumeMount{
 										{
@@ -241,8 +264,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 										},
 									},
 									Resources: corev1.ResourceRequirements{
+										Requests: corev1.ResourceList{
+											corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
+											corev1.ResourceMemory: resource.MustParse("6Gi"),
+										},
 										Limits: corev1.ResourceList{
-											corev1.ResourceCPU:    resource.MustParse("1"),
+											corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
 											corev1.ResourceMemory: resource.MustParse("6Gi"),
 										},
 									},
@@ -274,34 +301,44 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 		},
 	}
 
-	if useGPU {
+	if accelerator.isGpu() {
 		// Update resource lists for GPU (NVIDIA/ROCm) usecase
-		tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpuLabel)] = resource.MustParse(fmt.Sprint(numGpus))
-		tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpuLabel)] = resource.MustParse(fmt.Sprint(numGpus))
+		tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
+		tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
+		tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
+		tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
 
 		tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Env = []corev1.EnvVar{
 			{
 				Name:  "NCCL_DEBUG",
 				Value: "INFO",
 			},
+			{
+				Name:  "TORCH_DISTRIBUTED_DEBUG",
+				Value: "DETAIL",
+			},
 		}
 		tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Env = []corev1.EnvVar{
 			{
 				Name:  "NCCL_DEBUG",
 				Value: "INFO",
 			},
+			{
+				Name:  "TORCH_DISTRIBUTED_DEBUG",
+				Value: "DETAIL",
+			},
 		}
 
 		// Update tolerations
 		tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Tolerations = []corev1.Toleration{
 			{
-				Key:      gpuLabel,
+				Key:      accelerator.ResourceLabel,
 				Operator: corev1.TolerationOpExists,
 			},
 		}
 		tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Tolerations = []corev1.Toleration{
 			{
-				Key:      gpuLabel,
+				Key:      accelerator.ResourceLabel,
 				Operator: corev1.TolerationOpExists,
 			},
 		}
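
Note: the Accelerator type used above (CPU, NVIDIA, AMD, isGpu(), ResourceLabel), along with helpers such as OpenShiftPrometheusGpuUtil and GetPods, comes from the shared test support package and is not part of this diff. As a minimal, hypothetical sketch of the shape the patch assumes (illustrative names only, not the actual definition):

// Hypothetical sketch only — the real Accelerator definition lives in the
// shared support package and may differ; this mirrors how the diff uses it.
type Accelerator struct {
	Type          string // e.g. "cpu", "nvidia", "amd" (assumed values)
	ResourceLabel string // Kubernetes extended resource name, e.g. "nvidia.com/gpu"
}

// isGpu reports whether the accelerator maps to a GPU extended resource,
// which is how the patch chooses the "nccl" vs "gloo" backend.
func (a Accelerator) isGpu() bool {
	return a.ResourceLabel != ""
}

// The patch compares accelerators with == (accelerator == NVIDIA), so the
// struct must remain comparable.
var (
	CPU    = Accelerator{Type: "cpu"}
	NVIDIA = Accelerator{Type: "nvidia", ResourceLabel: "nvidia.com/gpu"}
	AMD    = Accelerator{Type: "amd", ResourceLabel: "amd.com/gpu"}
)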