@@ -121,7 +121,6 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 
 	// Create Kueue resources
 	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
-	fmt.Sprintln(gpu.ResourceLabel)
 	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
 	cqSpec := v1beta1.ClusterQueueSpec{
 		NamespaceSelector: &metav1.LabelSelector{},
@@ -191,13 +190,13 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 
 	for _, trainingPod := range trainingPods {
 		// Check that GPUs for training pods were utilized recently
-		test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 15*time.Minute).
+		test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, gpu), 10*time.Minute).
 			Should(
 				And(
 					HaveLen(numGpus),
 					ContainElement(
-						// Check that at least some GPU was utilized on more than 50%
-						HaveField("Value", BeNumerically(">", 50)),
+						// Check that at least some GPU was utilized on more than 10%
+						HaveField("Value", BeNumerically(">", 10)),
 					),
 				),
 			)
@@ -206,7 +205,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 	}
 
 	// Make sure the PyTorch job succeeded
-	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutGpuProvisioning).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
+	test.Eventually(PyTorchJob(test, namespace, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
 }
 
@@ -292,7 +291,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 			`torchrun /etc/config/hf_llm_training.py \
 				--model_uri /tmp/model/bloom-560m \
 				--model_dir /tmp/model/bloom-560m \
-				--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
+				--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 				--transformer_type AutoModelForCausalLM \
 				--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/tmp/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
 				--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
@@ -461,7 +460,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 			`torchrun /etc/config/hf_llm_training.py \
 				--model_uri /tmp/model/bloom-560m \
 				--model_dir /tmp/model/bloom-560m \
-				--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
+				--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 				--transformer_type AutoModelForCausalLM \
 				--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
 				--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
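
Note on the relaxed GPU assertion above: Gomega's And(HaveLen(n), ContainElement(HaveField("Value", BeNumerically(">", threshold)))) passes when the polled slice has one entry per requested GPU and at least one entry exceeds the threshold. A minimal, self-contained sketch follows; the Sample type and values are illustrative assumptions, and only the matcher names come from the diff.

package main

import (
	"fmt"

	. "github.com/onsi/gomega"
)

// Sample stands in for one GPU-utilization data point; in the real test such
// points are presumably polled from Prometheus via OpenShiftPrometheusGpuUtil.
type Sample struct {
	Value float64
}

func main() {
	g := NewGomega(func(msg string, _ ...int) { fmt.Println("FAIL:", msg) })

	// Two GPUs, one of them busier than the 10% threshold.
	samples := []Sample{{Value: 3.5}, {Value: 42.0}}

	g.Expect(samples).To(And(
		HaveLen(2), // one data point per requested GPU
		ContainElement(HaveField("Value", BeNumerically(">", 10))), // at least one GPU above 10%
	))
	fmt.Println("assertion passed")
}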