Skip to content

Commit fe53b3f

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 23954e4 + 5e94d5e commit fe53b3f

File tree

3 files changed: 49 additions and 9 deletions.

examples/stable-diffusion-dreambooth/yaml/operators/nvidia.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ metadata:
2525
name: gpu-operator-certified
2626
namespace: nvidia-gpu-operator
2727
spec:
28-
channel: v23.6
28+
channel: stable
2929
installPlanApproval: Automatic
3030
name: gpu-operator-certified
3131
source: certified-operators

tests/kfto/core/config_qlora.json

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
1+
{
2+
"model_name_or_path": "/tmp/model/bloom-560m",
3+
"training_data_path": "/etc/config/twitter_complaints_small.json",
4+
"output_dir": "/tmp/out",
5+
"save_model_dir": "/tmp/out",
6+
"num_train_epochs": 1.0,
7+
"per_device_train_batch_size": 4,
8+
"per_device_eval_batch_size": 4,
9+
"gradient_accumulation_steps": 4,
10+
"save_strategy": "no",
11+
"learning_rate": 1e-4,
12+
"weight_decay": 0.0,
13+
"lr_scheduler_type": "cosine",
14+
"include_tokens_per_second": true,
15+
"response_template": "\n### Label:",
16+
"dataset_text_field": "output",
17+
"use_flash_attn": false,
18+
"peft_method": "lora",
19+
"quantized_lora_config": {
20+
"auto_gptq": ["triton_v2"]
21+
},
22+
"torch_dtype": "float16",
23+
"fp16": true
24+
}

tests/kfto/core/kfto_kueue_sft_test.go

Lines changed: 24 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package core
1818

1919
import (
20+
"fmt"
2021
"testing"
2122

2223
. "github.com/onsi/gomega"
@@ -31,14 +32,17 @@ import (
3132
)
3233

3334
func TestPytorchjobWithSFTtrainerFinetuning(t *testing.T) {
34-
runPytorchjobWithSFTtrainer(t, "config.json")
35+
runPytorchjobWithSFTtrainer(t, "config.json", 0)
3536
}
3637

3738
func TestPytorchjobWithSFTtrainerLoRa(t *testing.T) {
38-
runPytorchjobWithSFTtrainer(t, "config_lora.json")
39+
runPytorchjobWithSFTtrainer(t, "config_lora.json", 0)
40+
}
41+
func TestPytorchjobWithSFTtrainerQLoRa(t *testing.T) {
42+
runPytorchjobWithSFTtrainer(t, "config_qlora.json", 1)
3943
}
4044

41-
func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
45+
func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus int) {
4246
test := With(t)
4347

4448
// Create a namespace
@@ -58,7 +62,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
5862
NamespaceSelector: &metav1.LabelSelector{},
5963
ResourceGroups: []kueuev1beta1.ResourceGroup{
6064
{
61-
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory")},
65+
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")},
6266
Flavors: []kueuev1beta1.FlavorQuotas{
6367
{
6468
Name: kueuev1beta1.ResourceFlavorReference(resourceFlavor.Name),
@@ -71,6 +75,10 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
7175
Name: corev1.ResourceMemory,
7276
NominalQuota: resource.MustParse("12Gi"),
7377
},
78+
{
79+
Name: corev1.ResourceName("nvidia.com/gpu"),
80+
NominalQuota: resource.MustParse(fmt.Sprint(numGpus)),
81+
},
7482
},
7583
},
7684
},
@@ -82,7 +90,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
8290
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
8391

8492
// Create training PyTorch job
85-
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
93+
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config, numGpus)
8694

8795
// Make sure the Kueue Workload is admitted
8896
test.Eventually(KueueWorkloads(test, namespace.Name), TestTimeoutLong).
@@ -146,14 +154,14 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
146154
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
147155

148156
// Create first training PyTorch job
149-
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
157+
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config, 0)
150158

151159
// Make sure the PyTorch job is running
152160
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).
153161
Should(WithTransform(PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
154162

155163
// Create second training PyTorch job
156-
secondTuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
164+
secondTuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config, 0)
157165

158166
// Make sure the second PyTorch job is suspended, waiting for first job to finish
159167
test.Eventually(PytorchJob(test, namespace.Name, secondTuningJob.Name), TestTimeoutShort).
@@ -172,7 +180,7 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
172180
test.T().Logf("PytorchJob %s/%s ran successfully", secondTuningJob.Namespace, secondTuningJob.Name)
173181
}
174182

175-
func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
183+
func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap, numGpus int) *kftov1.PyTorchJob {
176184
tuningJob := &kftov1.PyTorchJob{
177185
TypeMeta: metav1.TypeMeta{
178186
APIVersion: corev1.SchemeGroupVersion.String(),
@@ -191,6 +199,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
191199
RestartPolicy: "OnFailure",
192200
Template: corev1.PodTemplateSpec{
193201
Spec: corev1.PodSpec{
202+
Tolerations: []corev1.Toleration{
203+
{
204+
Key: "nvidia.com/gpu",
205+
Operator: corev1.TolerationOpExists,
206+
},
207+
},
194208
InitContainers: []corev1.Container{
195209
{
196210
Name: "copy-model",
@@ -235,10 +249,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
235249
Requests: corev1.ResourceList{
236250
corev1.ResourceCPU: resource.MustParse("2"),
237251
corev1.ResourceMemory: resource.MustParse("7Gi"),
252+
"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numGpus)),
238253
},
239254
Limits: corev1.ResourceList{
240255
corev1.ResourceCPU: resource.MustParse("2"),
241256
corev1.ResourceMemory: resource.MustParse("7Gi"),
257+
"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numGpus)),
242258
},
243259
},
244260
SecurityContext: &corev1.SecurityContext{

0 commit comments

Comments
 (0)