Skip to content

Commit de50808

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents face046 + 92575c8 commit de50808

File tree

4 files changed

+244
-253
lines changed

4 files changed

+244
-253
lines changed

tests/kfto/kfto_mnist_training_test.go

Lines changed: 84 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ import (
2020
"bytes"
2121
"fmt"
2222
"testing"
23+
"time"
2324

2425
kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
2526
. "github.com/onsi/gomega"
@@ -30,27 +31,30 @@ import (
3031
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3132
)
3233

33-
func TestPyTorchJobMnistMultiNodeCpu(t *testing.T) {
34-
runKFTOPyTorchMnistJob(t, 0, 2, "", GetCudaTrainingImage(), "resources/requirements.txt")
34+
func TestPyTorchJobMnistMultiNodeSingleCpu(t *testing.T) {
35+
runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 1)
3536
}
36-
37-
func TestPyTorchJobMnistMultiNodeWithCuda(t *testing.T) {
38-
runKFTOPyTorchMnistJob(t, 1, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt")
37+
func TestPyTorchJobMnistMultiNodeMultiCpu(t *testing.T) {
38+
runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 2)
3939
}
4040

41-
func TestPyTorchJobMnistMultiNodeWithROCm(t *testing.T) {
42-
runKFTOPyTorchMnistJob(t, 1, 1, "amd.com/gpu", GetROCmTrainingImage(), "resources/requirements-rocm.txt")
41+
func TestPyTorchJobMnistMultiNodeSingleGpuWithCuda(t *testing.T) {
42+
runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 1)
4343
}
4444

4545
func TestPyTorchJobMnistMultiNodeMultiGpuWithCuda(t *testing.T) {
46-
runKFTOPyTorchMnistJob(t, 2, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt")
46+
runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 2)
47+
}
48+
49+
func TestPyTorchJobMnistMultiNodeSingleGpuWithROCm(t *testing.T) {
50+
runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 1)
4751
}
4852

4953
func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm(t *testing.T) {
50-
runKFTOPyTorchMnistJob(t, 2, 1, "amd.com/gpu", GetROCmTrainingImage(), "resources/requirements-rocm.txt")
54+
runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 2)
5155
}
5256

53-
func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLabel string, image string, requirementsFile string) {
57+
func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string, requirementsFile string, workerReplicas, numProcPerNode int) {
5458
test := With(t)
5559

5660
// Create a namespace
@@ -59,7 +63,7 @@ func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLa
5963
mnist := ReadFile(test, "resources/mnist.py")
6064
requirementsFileName := ReadFile(test, requirementsFile)
6165

62-
if numGpus > 0 {
66+
if accelerator.isGpu() {
6367
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
6468
} else {
6569
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"cpu\""), 1)
@@ -70,29 +74,44 @@ func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLa
7074
"requirements.txt": requirementsFileName,
7175
})
7276

73-
outputPvc := CreatePersistentVolumeClaim(test, namespace.Name, "50Gi", corev1.ReadWriteOnce)
74-
defer test.Client().Core().CoreV1().PersistentVolumeClaims(namespace.Name).Delete(test.Ctx(), outputPvc.Name, metav1.DeleteOptions{})
75-
7677
// Create training PyTorch job
77-
tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, gpuLabel, numGpus, workerReplicas, outputPvc.Name, image)
78+
tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, accelerator, workerReplicas, numProcPerNode, image)
7879
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
7980

8081
// Make sure the PyTorch job is running
8182
test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
8283
Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
8384

85+
// Verify GPU utilization
86+
if IsOpenShift(test) && accelerator == NVIDIA {
87+
trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
88+
test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node
89+
90+
for _, trainingPod := range trainingPods {
91+
// Check that GPUs for training pods were utilized recently
92+
test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
93+
Should(
94+
And(
95+
HaveLen(numProcPerNode),
96+
ContainElement(
97+
// Check that at least some GPU was utilized on more than 20%
98+
HaveField("Value", BeNumerically(">", 20)),
99+
),
100+
),
101+
)
102+
}
103+
test.T().Log("All GPUs were successfully utilized")
104+
}
105+
84106
// Make sure the PyTorch job succeeded
85107
test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
86108
test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
87109

88110
}
89111

90-
func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, gpuLabel string, numGpus int, workerReplicas int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
91-
var useGPU = false
112+
func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, accelerator Accelerator, workerReplicas int, numProcPerNode int, baseImage string) *kftov1.PyTorchJob {
92113
var backend string
93-
94-
if numGpus > 0 {
95-
useGPU = true
114+
if accelerator.isGpu() {
96115
backend = "nccl"
97116
} else {
98117
backend = "gloo"
@@ -108,13 +127,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
108127
},
109128
Spec: kftov1.PyTorchJobSpec{
110129
PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
111-
"Master": {
130+
kftov1.PyTorchJobReplicaTypeMaster: {
112131
Replicas: Ptr(int32(1)),
113132
RestartPolicy: kftov1.RestartPolicyOnFailure,
114133
Template: corev1.PodTemplateSpec{
115134
ObjectMeta: metav1.ObjectMeta{
116135
Labels: map[string]string{
117-
"app": "kfto-mnist",
136+
"app": "kfto-mnist",
137+
"role": "master",
118138
},
119139
},
120140
Spec: corev1.PodSpec{
@@ -139,9 +159,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
139159
ImagePullPolicy: corev1.PullIfNotPresent,
140160
Command: []string{
141161
"/bin/bash", "-c",
142-
fmt.Sprintf(`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
162+
fmt.Sprintf(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
143163
pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
144-
python /mnt/files/mnist.py --epochs 3 --save-model --output-path /mnt/output --backend %s`, backend),
164+
echo "Downloading MNIST dataset..." && \
165+
python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
166+
MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
167+
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
168+
echo -e "\n\n Starting training..." && \
169+
torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
145170
},
146171
VolumeMounts: []corev1.VolumeMount{
147172
{
@@ -152,14 +177,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
152177
Name: "tmp-volume",
153178
MountPath: "/tmp",
154179
},
155-
{
156-
Name: "output-volume",
157-
MountPath: "/mnt/output",
158-
},
159180
},
160181
Resources: corev1.ResourceRequirements{
182+
Requests: corev1.ResourceList{
183+
corev1.ResourceCPU: resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
184+
corev1.ResourceMemory: resource.MustParse("6Gi"),
185+
},
161186
Limits: corev1.ResourceList{
162-
corev1.ResourceCPU: resource.MustParse("1"),
187+
corev1.ResourceCPU: resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
163188
corev1.ResourceMemory: resource.MustParse("6Gi"),
164189
},
165190
},
@@ -182,26 +207,19 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
182207
EmptyDir: &corev1.EmptyDirVolumeSource{},
183208
},
184209
},
185-
{
186-
Name: "output-volume",
187-
VolumeSource: corev1.VolumeSource{
188-
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
189-
ClaimName: outputPvcName,
190-
},
191-
},
192-
},
193210
},
194211
RestartPolicy: corev1.RestartPolicyOnFailure,
195212
},
196213
},
197214
},
198-
"Worker": {
215+
kftov1.PyTorchJobReplicaTypeWorker: {
199216
Replicas: Ptr(int32(workerReplicas)),
200217
RestartPolicy: kftov1.RestartPolicyOnFailure,
201218
Template: corev1.PodTemplateSpec{
202219
ObjectMeta: metav1.ObjectMeta{
203220
Labels: map[string]string{
204-
"app": "kfto-mnist",
221+
"app": "kfto-mnist",
222+
"role": "worker",
205223
},
206224
},
207225
Spec: corev1.PodSpec{
@@ -226,9 +244,14 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
226244
ImagePullPolicy: corev1.PullIfNotPresent,
227245
Command: []string{
228246
"/bin/bash", "-c",
229-
fmt.Sprintf(`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
247+
fmt.Sprintf(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
230248
pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
231-
python /mnt/files/mnist.py --epochs 3 --save-model --backend %s`, backend),
249+
echo "Downloading MNIST dataset..." && \
250+
python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
251+
MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
252+
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
253+
echo -e "\n\n Starting training..." && \
254+
torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
232255
},
233256
VolumeMounts: []corev1.VolumeMount{
234257
{
@@ -241,8 +264,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
241264
},
242265
},
243266
Resources: corev1.ResourceRequirements{
267+
Requests: corev1.ResourceList{
268+
corev1.ResourceCPU: resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
269+
corev1.ResourceMemory: resource.MustParse("6Gi"),
270+
},
244271
Limits: corev1.ResourceList{
245-
corev1.ResourceCPU: resource.MustParse("1"),
272+
corev1.ResourceCPU: resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
246273
corev1.ResourceMemory: resource.MustParse("6Gi"),
247274
},
248275
},
@@ -274,34 +301,44 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
274301
},
275302
}
276303

277-
if useGPU {
304+
if accelerator.isGpu() {
278305
// Update resource lists for GPU (NVIDIA/ROCm) usecase
279-
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpuLabel)] = resource.MustParse(fmt.Sprint(numGpus))
280-
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpuLabel)] = resource.MustParse(fmt.Sprint(numGpus))
306+
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
307+
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
308+
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
309+
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(accelerator.ResourceLabel)] = resource.MustParse(fmt.Sprint(numProcPerNode))
281310

282311
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Env = []corev1.EnvVar{
283312
{
284313
Name: "NCCL_DEBUG",
285314
Value: "INFO",
286315
},
316+
{
317+
Name: "TORCH_DISTRIBUTED_DEBUG",
318+
Value: "DETAIL",
319+
},
287320
}
288321
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Env = []corev1.EnvVar{
289322
{
290323
Name: "NCCL_DEBUG",
291324
Value: "INFO",
292325
},
326+
{
327+
Name: "TORCH_DISTRIBUTED_DEBUG",
328+
Value: "DETAIL",
329+
},
293330
}
294331

295332
// Update tolerations
296333
tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Tolerations = []corev1.Toleration{
297334
{
298-
Key: gpuLabel,
335+
Key: accelerator.ResourceLabel,
299336
Operator: corev1.TolerationOpExists,
300337
},
301338
}
302339
tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Tolerations = []corev1.Toleration{
303340
{
304-
Key: gpuLabel,
341+
Key: accelerator.ResourceLabel,
305342
Operator: corev1.TolerationOpExists,
306343
},
307344
}

tests/kfto/kfto_training_test.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ func TestPyTorchJobMultiNodeMultiGpuWithROCm(t *testing.T) {
6262
runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 2, 1)
6363
}
6464

65-
func runKFTOPyTorchJob(t *testing.T, image string, gpu Gpu, numGpus, numberOfWorkerNodes int) {
65+
func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, numberOfWorkerNodes int) {
6666
test := With(t)
6767

6868
// Create a namespace
@@ -98,7 +98,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Gpu, numGpus, numberOfWor
9898
And(
9999
HaveLen(numGpus),
100100
ContainElement(
101-
// Check that at lest some GPU was utilized on more than 50%
101+
// Check that at least some GPU was utilized on more than 50%
102102
HaveField("Value", BeNumerically(">", 50)),
103103
),
104104
),
@@ -112,7 +112,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Gpu, numGpus, numberOfWor
112112
test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
113113
}
114114

115-
func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpu Gpu, numGpus, numberOfWorkerNodes int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
115+
func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpu Accelerator, numGpus, numberOfWorkerNodes int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
116116
tuningJob := &kftov1.PyTorchJob{
117117
TypeMeta: metav1.TypeMeta{
118118
APIVersion: corev1.SchemeGroupVersion.String(),

0 commit comments

Comments
 (0)