
Commit 055a251

Merge remote-tracking branch 'upstream/main'
2 parents: 9673302 + 72f04b0

File tree

2 files changed: 7 additions & 8 deletions


tests/kfto/kfto_mnist_training_test.go

Lines changed: 1 addition & 1 deletion
@@ -334,7 +334,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 	python3 /mnt/files/download_mnist_datasets.py --dataset_path "/tmp/datasets/mnist" && \
 	echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
 	echo -e "\n\n Starting training..." && \
-	torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
+	torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 3 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
 },
 VolumeMounts: []corev1.VolumeMount{
 	{
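The torchrun line above is a Go format string that the test fills in with fmt.Sprintf when it builds the container command. As a quick illustration of what the updated command renders to, here is a minimal sketch; the numProcPerNode and backend values are assumptions for illustration, not values taken from this diff:

package main

import "fmt"

func main() {
	// Illustrative assumptions: 2 processes per node and the "gloo"
	// backend; the real test derives both from its parameters.
	numProcPerNode := 2
	backend := "gloo"

	// Same format string as the "+" line in the hunk above (--epochs 3).
	cmd := fmt.Sprintf(`torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 3 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend)
	fmt.Println(cmd)
}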

tests/kfto/kfto_training_test.go

Lines changed: 6 additions & 7 deletions
@@ -121,7 +121,6 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num

 	// Create Kueue resources
 	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
-	fmt.Sprintln(gpu.ResourceLabel)
 	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
 	cqSpec := v1beta1.ClusterQueueSpec{
 		NamespaceSelector: &metav1.LabelSelector{},

@@ -191,13 +190,13 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num

 	for _, trainingPod := range trainingPods {
 		// Check that GPUs for training pods were utilized recently
-		test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 15*time.Minute).
+		test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 10*time.Minute).
 			Should(
 				And(
 					HaveLen(numGpus),
 					ContainElement(
-						// Check that at least some GPU was utilized on more than 50%
-						HaveField("Value", BeNumerically(">", 50)),
+						// Check that at least some GPU was utilized on more than 10%
+						HaveField("Value", BeNumerically(">", 10)),
 					),
 				),
 			)

@@ -206,7 +205,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 	}

 	// Make sure the PyTorch job succeeded
-	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutGpuProvisioning).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
+	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
 }

@@ -292,7 +291,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 	`torchrun /etc/config/hf_llm_training.py \
 		--model_uri /tmp/model/bloom-560m \
 		--model_dir /tmp/model/bloom-560m \
-		--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
+		--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 		--transformer_type AutoModelForCausalLM \
 		--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/tmp/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
 		--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,

@@ -461,7 +460,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 	`torchrun /etc/config/hf_llm_training.py \
 		--model_uri /tmp/model/bloom-560m \
 		--model_dir /tmp/model/bloom-560m \
-		--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
+		--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 		--transformer_type AutoModelForCausalLM \
 		--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
 		--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
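For context on the assertion being retuned in the second hunk, here is a minimal, self-contained sketch of the same Gomega matcher composition run against static data. The Sample type and the utilization numbers are assumptions for illustration; the real test feeds in Prometheus query results via OpenShiftPrometheusGpuUtil, which this diff does not show.

package main

import (
	"fmt"

	. "github.com/onsi/gomega"
)

// Sample stands in for one metric sample; HaveField("Value", ...) looks up
// a field of this name, mirroring the matcher chain in the test.
type Sample struct {
	Value float64
}

func main() {
	// A Gomega instance with a print-only fail handler, so the sketch
	// runs outside a *testing.T context.
	g := NewGomega(func(message string, _ ...int) {
		fmt.Println("assertion failed:", message)
	})

	// Two simulated per-GPU utilization percentages for numGpus = 2.
	samples := []Sample{{Value: 3.5}, {Value: 42.0}}

	// Same composition as the updated test: one sample per GPU, and at
	// least one sample above the new 10% threshold.
	matched := g.Expect(samples).To(
		And(
			HaveLen(2),
			ContainElement(HaveField("Value", BeNumerically(">", 10))),
		),
	)
	fmt.Println("matched:", matched)
}

In the real test, Eventually polls this matcher for up to 10 minutes, so the 10% threshold only has to be crossed once within that window for the assertion to pass.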
