Skip to content

Commit 92575c8

Browse files
Rename the Gpu struct to Accelerator and add an isGpu method for use in the KFTO tests
1 parent 3c2ca7d commit 92575c8

File tree

3 files changed

+30
-31
lines changed

3 files changed

+30
-31
lines changed

tests/kfto/kfto_mnist_training_test.go

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm(t *testing.T) {
5454
runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 2)
5555
}
5656

57-
func runKFTOPyTorchMnistJob(t *testing.T, gpu Gpu, image string, requirementsFile string, workerReplicas, numProcPerNode int) {
57+
func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string, requirementsFile string, workerReplicas, numProcPerNode int) {
5858
test := With(t)
5959

6060
// Create a namespace
@@ -63,7 +63,7 @@ func runKFTOPyTorchMnistJob(t *testing.T, gpu Gpu, image string, requirementsFil
6363
mnist := ReadFile(test, "resources/mnist.py")
6464
requirementsFileName := ReadFile(test, requirementsFile)
6565

66-
if workerReplicas*numProcPerNode > 0 && gpu.ResourceLabel != "cpu" {
66+
if accelerator.isGpu() {
6767
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
6868
} else {
6969
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"cpu\""), 1)
@@ -74,31 +74,28 @@ func runKFTOPyTorchMnistJob(t *testing.T, gpu Gpu, image string, requirementsFil
7474
"requirements.txt": requirementsFileName,
7575
})
7676

77-
outputPvc := CreatePersistentVolumeClaim(test, namespace.Name, "50Gi", corev1.ReadWriteOnce)
78-
defer test.Client().Core().CoreV1().PersistentVolumeClaims(namespace.Name).Delete(test.Ctx(), outputPvc.Name, metav1.DeleteOptions{})
79-
8077
// Create training PyTorch job
81-
tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, gpu, workerReplicas, numProcPerNode, outputPvc.Name, image)
78+
tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, accelerator, workerReplicas, numProcPerNode, image)
8279
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
8380

8481
// Make sure the PyTorch job is running
8582
test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
8683
Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
8784

8885
// Verify GPU utilization
89-
if IsOpenShift(test) && gpu == NVIDIA {
86+
if IsOpenShift(test) && accelerator == NVIDIA {
9087
trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
9188
test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node
9289

9390
for _, trainingPod := range trainingPods {
9491
// Check that GPUs for training pods were utilized recently
95-
test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 15*time.Minute).
92+
test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
9693
Should(
9794
And(
9895
HaveLen(numProcPerNode),
9996
ContainElement(
100-
// Check that at least some GPU was utilized on more than 30%
101-
HaveField("Value", BeNumerically(">", 30)),
97+
// Check that at least some GPU was utilized on more than 20%
98+
HaveField("Value", BeNumerically(">", 20)),
10299
),
103100
),
104101
)
@@ -112,12 +109,9 @@ func runKFTOPyTorchMnistJob(t *testing.T, gpu Gpu, image string, requirementsFil
112109

113110
}
114111

115-
func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, gpu Gpu, workerReplicas int, numProcPerNode int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
116-
var useGPU = false
112+
func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, accelerator Accelerator, workerReplicas int, numProcPerNode int, baseImage string) *kftov1.PyTorchJob {
117113
var backend string
118-
119-
if gpu.ResourceLabel != "cpu" {
120-
useGPU = true
114+
if accelerator.isGpu() {
121115
backend = "nccl"
122116
} else {
123117
backend = "gloo"
@@ -172,7 +166,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
172166
MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
173167
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
174168
echo -e "\n\n Starting training..." && \
175-
torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 64 --lr 0.0005 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
169+
torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
176170
},
177171
VolumeMounts: []corev1.VolumeMount{
178172
{
@@ -257,7 +251,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
257251
MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
258252
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
259253
echo -e "\n\n Starting training..." && \
260-
torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 64 --lr 0.0005 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
254+
torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
261255
},
262256
VolumeMounts: []corev1.VolumeMount{
263257
{
@@ -307,12 +301,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
307301
},
308302
}
309303

310-
if useGPU {
304+
if accelerator.isGpu() {
311305
// Update resource lists for GPU (NVIDIA/ROCm) usecase
312-
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(gpu.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
313-
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpu.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
314-
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(gpu.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
315-
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpu.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
306+
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
307+
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
308+
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
309+
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
316310

317311
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Env = []corev1.EnvVar{
318312
{
@@ -338,13 +332,13 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
338332
// Update tolerations
339333
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Tolerations = []corev1.Toleration{
340334
{
341-
Key: gpu.ResourceLabel,
335+
Key: accelerator.ResourceLabel,
342336
Operator: corev1.TolerationOpExists,
343337
},
344338
}
345339
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Tolerations = []corev1.Toleration{
346340
{
347-
Key: gpu.ResourceLabel,
341+
Key: accelerator.ResourceLabel,
348342
Operator: corev1.TolerationOpExists,
349343
},
350344
}

tests/kfto/kfto_training_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func TestPyTorchJobMultiNodeMultiGpuWithROCm(t *testing.T) {
6262
runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 2, 1)
6363
}
6464

65-
func runKFTOPyTorchJob(t *testing.T, image string, gpu Gpu, numGpus, numberOfWorkerNodes int) {
65+
func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, numberOfWorkerNodes int) {
6666
test := With(t)
6767

6868
// Create a namespace
@@ -112,7 +112,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Gpu, numGpus, numberOfWor
112112
test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
113113
}
114114

115-
func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpu Gpu, numGpus, numberOfWorkerNodes int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
115+
func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpu Accelerator, numGpus, numberOfWorkerNodes int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
116116
tuningJob := &kftov1.PyTorchJob{
117117
TypeMeta: metav1.TypeMeta{
118118
APIVersion: corev1.SchemeGroupVersion.String(),

tests/kfto/support.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,22 @@ import (
2828
corev1 "k8s.io/api/core/v1"
2929
)
3030

31-
// Accelerator describes a hardware accelerator targeted by the KFTO tests.
// ResourceLabel is the Kubernetes extended-resource name requested by the
// training pods (e.g. "nvidia.com/gpu"); PrometheusGpuUtilizationLabel is the
// Prometheus metric name used to verify GPU utilization on OpenShift (empty
// when no such metric exists for the accelerator).
type Accelerator struct {
	ResourceLabel                 string
	PrometheusGpuUtilizationLabel string
}

// Predefined accelerators used by the test suites. CPU is intentionally the
// zero value: it requests no extended resource and has no utilization metric.
var (
	NVIDIA = Accelerator{ResourceLabel: "nvidia.com/gpu", PrometheusGpuUtilizationLabel: "DCGM_FI_DEV_GPU_UTIL"}
	AMD    = Accelerator{ResourceLabel: "amd.com/gpu"}
	CPU    = Accelerator{}
)

// isGpu reports whether a is a GPU accelerator, i.e. anything other than the
// CPU (zero-value) accelerator.
func (a Accelerator) isGpu() bool {
	return a != CPU
}
46+
4247
//go:embed resources/*
4348
var files embed.FS
4449

@@ -49,7 +54,7 @@ func ReadFile(t Test, fileName string) []byte {
4954
return file
5055
}
5156

52-
func OpenShiftPrometheusGpuUtil(test Test, pod corev1.Pod, gpu Gpu) func(g Gomega) prometheusmodel.Vector {
57+
func OpenShiftPrometheusGpuUtil(test Test, pod corev1.Pod, gpu Accelerator) func(g Gomega) prometheusmodel.Vector {
5358
return func(g Gomega) prometheusmodel.Vector {
5459
prometheusApiClient := GetOpenShiftPrometheusApiClient(test)
5560
result, warnings, err := prometheusApiClient.Query(test.Ctx(), gpu.PrometheusGpuUtilizationLabel, time.Now(), prometheusapiv1.WithTimeout(5*time.Second))

0 commit comments

Comments
 (0)