
Commit 055a251

Merge remote-tracking branch 'upstream/main'
2 parents: 9673302 + 72f04b0

File tree

2 files changed: 7 additions & 8 deletions


tests/kfto/kfto_mnist_training_test.go

Lines changed: 1 addition & 1 deletion
@@ -334,7 +334,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 	python3 /mnt/files/download_mnist_datasets.py --dataset_path "/tmp/datasets/mnist" && \
 	echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
 	echo -e "\n\n Starting training..." && \
-	torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
+	torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 3 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend),
 },
 VolumeMounts: []corev1.VolumeMount{
 	{
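The torchrun line above is a Go format string that the test fills in with fmt.Sprintf when it builds the container command. As a quick illustration of what the updated command renders to, here is a minimal sketch; the numProcPerNode and backend values are assumptions for illustration, not values taken from this diff:

package main

import "fmt"

func main() {
	// Illustrative assumptions: 2 processes per node and the "gloo"
	// backend; the real test derives both from its parameters.
	numProcPerNode := 2
	backend := "gloo"

	// Same format string as the "+" line in the hunk above (--epochs 3).
	cmd := fmt.Sprintf(`torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 3 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s`, numProcPerNode, backend)
	fmt.Println(cmd)
}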

tests/kfto/kfto_training_test.go

Lines changed: 6 additions & 7 deletions
@@ -121,7 +121,6 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num

 	// Create Kueue resources
 	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
-	fmt.Sprintln(gpu.ResourceLabel)
 	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
 	cqSpec := v1beta1.ClusterQueueSpec{
 		NamespaceSelector: &metav1.LabelSelector{},

@@ -191,13 +190,13 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num

 	for _, trainingPod := range trainingPods {
 		// Check that GPUs for training pods were utilized recently
-		test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 15*time.Minute).
+		test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 10*time.Minute).
 			Should(
 				And(
 					HaveLen(numGpus),
 					ContainElement(
-						// Check that at least some GPU was utilized on more than 50%
-						HaveField("Value", BeNumerically(">", 50)),
+						// Check that at least some GPU was utilized on more than 10%
+						HaveField("Value", BeNumerically(">", 10)),
 					),
 				),
 			)

@@ -206,7 +205,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 	}

 	// Make sure the PyTorch job succeeded
-	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutGpuProvisioning).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
+	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
 }

@@ -292,7 +291,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 	`torchrun /etc/config/hf_llm_training.py \
 		--model_uri /tmp/model/bloom-560m \
 		--model_dir /tmp/model/bloom-560m \
-		--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
+		--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 		--transformer_type AutoModelForCausalLM \
 		--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/tmp/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
 		--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,

@@ -461,7 +460,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 	`torchrun /etc/config/hf_llm_training.py \
 		--model_uri /tmp/model/bloom-560m \
 		--model_dir /tmp/model/bloom-560m \
-		--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
+		--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 		--transformer_type AutoModelForCausalLM \
 		--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
 		--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
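For context on the assertion being retuned in the second hunk, here is a minimal, self-contained sketch of the same Gomega matcher composition run against static data. The Sample type and the utilization numbers are assumptions for illustration; the real test feeds in Prometheus query results via OpenShiftPrometheusGpuUtil, which this diff does not show.

package main

import (
	"fmt"

	. "github.com/onsi/gomega"
)

// Sample stands in for one metric sample; HaveField("Value", ...) looks up
// a field of this name, mirroring the matcher chain in the test.
type Sample struct {
	Value float64
}

func main() {
	// A Gomega instance with a print-only fail handler, so the sketch
	// runs outside a *testing.T context.
	g := NewGomega(func(message string, _ ...int) {
		fmt.Println("assertion failed:", message)
	})

	// Two simulated per-GPU utilization percentages for numGpus = 2.
	samples := []Sample{{Value: 3.5}, {Value: 42.0}}

	// Same composition as the updated test: one sample per GPU, and at
	// least one sample above the new 10% threshold.
	matched := g.Expect(samples).To(
		And(
			HaveLen(2),
			ContainElement(HaveField("Value", BeNumerically(">", 10))),
		),
	)
	fmt.Println("matched:", matched)
}

In the real test, Eventually polls this matcher for up to 10 minutes, so the 10% threshold only has to be crossed once within that window for the assertion to pass.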
