@@ -29,19 +29,26 @@ import (
29
29
. "github.com/onsi/gomega"
30
30
31
31
corev1 "k8s.io/api/core/v1"
32
+ "k8s.io/apimachinery/pkg/api/resource"
32
33
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34
+ "sigs.k8s.io/kueue/apis/kueue/v1beta1"
33
35
34
36
. "github.com/opendatahub-io/distributed-workloads/tests/common"
35
37
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
36
38
"github.com/opendatahub-io/distributed-workloads/tests/odh"
37
39
)
38
40
39
- func TestKftoSftLlmLlama3_1_8BInstruct (t * testing.T ) {
41
+ func TestKftoSftLlmLlama3_1_8BInstructWithCudaPyTorch251 (t * testing.T ) {
40
42
Tags (t , KftoCuda )
41
- kftoSftLlm (t , "meta-llama/Llama-3.1-8B-Instruct" )
43
+ kftoSftLlm (t , GetTrainingCudaPyTorch251Image (), NVIDIA , "meta-llama/Llama-3.1-8B-Instruct" )
42
44
}
43
45
44
- func kftoSftLlm (t * testing.T , modelName string ) {
46
+ func TestKftoSftLlmLlama3_1_8BInstructWithROCmPyTorch251 (t * testing.T ) {
47
+ Tags (t , KftoRocm )
48
+ kftoSftLlm (t , GetTrainingROCmPyTorch251Image (), AMD , "meta-llama/Llama-3.1-8B-Instruct" )
49
+ }
50
+
51
+ func kftoSftLlm (t * testing.T , image string , gpu Accelerator , modelName string ) {
45
52
test := With (t )
46
53
47
54
// Create a namespace
@@ -66,28 +73,66 @@ func kftoSftLlm(t *testing.T, modelName string) {
66
73
// Create PVC for Notebook
67
74
notebookPVC := CreatePersistentVolumeClaim (test , namespace .Name , "500Gi" , AccessModes (corev1 .ReadWriteMany ), StorageClassName (storageClass .Name ))
68
75
76
+ // Create Kueue resources
77
+ resourceFlavor := CreateKueueResourceFlavor (test , v1beta1.ResourceFlavorSpec {})
78
+ defer test .Client ().Kueue ().KueueV1beta1 ().ResourceFlavors ().Delete (test .Ctx (), resourceFlavor .Name , metav1.DeleteOptions {})
79
+ cqSpec := v1beta1.ClusterQueueSpec {
80
+ NamespaceSelector : & metav1.LabelSelector {},
81
+ ResourceGroups : []v1beta1.ResourceGroup {
82
+ {
83
+ CoveredResources : []corev1.ResourceName {corev1 .ResourceName ("cpu" ), corev1 .ResourceName ("memory" ), corev1 .ResourceName (gpu .ResourceLabel )},
84
+ Flavors : []v1beta1.FlavorQuotas {
85
+ {
86
+ Name : v1beta1 .ResourceFlavorReference (resourceFlavor .Name ),
87
+ Resources : []v1beta1.ResourceQuota {
88
+ {
89
+ Name : corev1 .ResourceCPU ,
90
+ NominalQuota : resource .MustParse ("32" ),
91
+ },
92
+ {
93
+ Name : corev1 .ResourceMemory ,
94
+ NominalQuota : resource .MustParse ("512Gi" ),
95
+ },
96
+ {
97
+ Name : corev1 .ResourceName (gpu .ResourceLabel ),
98
+ NominalQuota : resource .MustParse ("8" ),
99
+ },
100
+ },
101
+ },
102
+ },
103
+ },
104
+ },
105
+ }
106
+
107
+ clusterQueue := CreateKueueClusterQueue (test , cqSpec )
108
+ defer test .Client ().Kueue ().KueueV1beta1 ().ClusterQueues ().Delete (test .Ctx (), clusterQueue .Name , metav1.DeleteOptions {})
109
+ localQueue := CreateKueueLocalQueue (test , namespace .Name , clusterQueue .Name , AsDefaultQueue )
110
+
69
111
// Read and update the notebook content
70
112
notebookContent := odh .ReadFileExt (test , workingDirectory + "/../../examples/kfto-sft-llm/sft.ipynb" )
71
113
updatedNotebookContent := string (notebookContent )
72
114
73
115
// Update notebook parameters for testing
74
116
requiredChangesInNotebook := map [string ]string {
75
117
"model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct" : fmt .Sprintf ("model_name_or_path: %s" , modelName ),
76
- "num_train_epochs: 10" : "num_train_epochs: 1" ,
77
- "output_dir: /mnt/shared/Meta-Llama-3.1-8B-Instruct" : fmt .Sprintf ("output_dir: /mnt/shared/%s" , modelName ),
78
- "api_server = \\ \" <API_SERVER>\\ \" " : fmt .Sprintf ("api_server = \\ \" %s\\ \" " , GetOpenShiftApiUrl (test )),
79
- "token = \\ \" <TOKEN>\\ \" " : fmt .Sprintf ("token = \\ \" %s\\ \" " , userToken ),
80
- "#configuration.verify_ssl = False" : "configuration.verify_ssl = False" ,
81
- "name=\\ \" sft\\ \" " : fmt .Sprintf ("name=\\ \" sft-%s\\ \" " , namespace .Name ),
82
- "\" HF_TOKEN\\ \" : \\ \" \\ \" " : fmt .Sprintf ("\" HF_TOKEN\\ \" : \\ \" %s\\ \" " , hfToken ),
83
- "claim_name=\\ \" shared\\ \" " : fmt .Sprintf ("claim_name=\\ \" %s\\ \" " , notebookPVC .Name ),
84
- "eval_strategy: epoch" : "eval_strategy: 'no'" ,
85
- "logging_steps: 1" : "logging_steps: 10" ,
86
- "\" client.get_job_logs(\\ n\" ," : "\" client.wait_for_job_conditions(\\ n\" ," ,
87
- "\" follow=True,\\ n\" ," : "\" wait_timeout=1800,\\ n\" ,\n \t \" polling_interval=60,\\ n\" ," ,
88
- "os.environ[\\ \" TENSORBOARD_PROXY_URL\\ \" ]" : "#os.environ[\\ \" TENSORBOARD_PROXY_URL\\ \" ]" ,
89
- "%load_ext tensorboard" : "#%load_ext tensorboard" ,
90
- "%tensorboard --logdir /opt/app-root/src/shared" : "#%tensorboard --logdir /opt/app-root/src/shared" ,
118
+ "num_train_epochs: 10" : "num_train_epochs: 1" ,
119
+ "eval_strategy: epoch" : "eval_strategy: 'no'" ,
120
+ "logging_steps: 1" : "logging_steps: 10" ,
121
+ "output_dir: /mnt/shared/Meta-Llama-3.1-8B-Instruct" : fmt .Sprintf ("output_dir: /mnt/shared/%s" , modelName ),
122
+ "api_server = \\ \" <API_SERVER>\\ \" " : fmt .Sprintf ("api_server = \\ \" %s\\ \" " , GetOpenShiftApiUrl (test )),
123
+ "token = \\ \" <TOKEN>\\ \" " : fmt .Sprintf ("token = \\ \" %s\\ \" " , userToken ),
124
+ "#configuration.verify_ssl = False" : "configuration.verify_ssl = False" ,
125
+ "name=\\ \" sft\\ \" " : fmt .Sprintf ("name=\\ \" sft-%s\\ \" " , namespace .Name ),
126
+ "train_func=main," : fmt .Sprintf ("labels= {\\ n\" ,\n \t \" \\ \" kueue.x-k8s.io/queue-name\\ \" : \\ \" %s\\ \" \\ n\" ,\n \t \" },\\ n\" ,\n \t \" train_func=main," , localQueue .Name ),
127
+ " \\ \" nvidia.com/gpu\\ \" " : fmt .Sprintf (" \\ \" %s\\ \" " , gpu .ResourceLabel ),
128
+ "base_image=\\ \" quay.io/modh/training:py311-cuda124-torch251\\ \" " : fmt .Sprintf ("base_image=\\ \" %s\\ \" " , image ),
129
+ "\" HF_TOKEN\\ \" : \\ \" \\ \" " : fmt .Sprintf ("\" HF_TOKEN\\ \" : \\ \" %s\\ \" " , hfToken ),
130
+ "claim_name=\\ \" shared\\ \" " : fmt .Sprintf ("claim_name=\\ \" %s\\ \" " , notebookPVC .Name ),
131
+ "\" client.get_job_logs(\\ n\" ," : "\" client.wait_for_job_conditions(\\ n\" ," ,
132
+ "\" follow=True,\\ n\" ," : "\" wait_timeout=1800,\\ n\" ,\n \t \" polling_interval=60,\\ n\" ," ,
133
+ "os.environ[\\ \" TENSORBOARD_PROXY_URL\\ \" ]" : "#os.environ[\\ \" TENSORBOARD_PROXY_URL\\ \" ]" ,
134
+ "%load_ext tensorboard" : "#%load_ext tensorboard" ,
135
+ "%tensorboard --logdir /opt/app-root/src/shared" : "#%tensorboard --logdir /opt/app-root/src/shared" ,
91
136
"pretrained_path = \\ \" /opt/app-root/src/shared/.cache/hub/models--Meta-Llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/\\ \" " : "pretrained_path = \\ \" /opt/app-root/src/.cache/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/\\ \" " ,
92
137
"# Test the pre-trained model" : "# Test the pre-trained model\\ n\" ,\n \" from IPython.display import Markdown, display\\ n\" ,\n \" import os" ,
93
138
"display(Markdown(output1))" : "display(Markdown(output1))\\ n\" ,\n \" \\ n\" ,\n \" # Save to file\\ n\" ,\n \" output_path = \\ \" /opt/app-root/src/pretrained_output.md\\ \" \\ n\" ,\n \" os.makedirs(os.path.dirname(output_path), exist_ok=True)\\ n\" ,\n \" with open(output_path, \\ \" w\\ \" ) as f:\\ n\" ,\n \t \" f.write(output1)" ,
@@ -128,7 +173,7 @@ func kftoSftLlm(t *testing.T, modelName string) {
128
173
}
129
174
130
175
// Create Notebook CR
131
- CreateNotebook (test , namespace , userToken , notebookCommand , config .Name , "sft.ipynb" , 1 , notebookPVC )
176
+ CreateNotebook (test , namespace , userToken , notebookCommand , config .Name , "sft.ipynb" , 1 , notebookPVC , ContainerSizeMedium , gpu . ResourceLabel )
132
177
133
178
// Gracefully cleanup Notebook
134
179
defer func () {
@@ -177,7 +222,7 @@ func kftoSftLlm(t *testing.T, modelName string) {
177
222
test .T ().Logf ("Notebook execution completed as indicated by marker file: '%s'" , markerContent )
178
223
return true
179
224
} else if markerContent == "FAILURE" {
180
- errMessage := fmt .Sprintf ("Notebook execution failed as indicated by marker file: '%s'" , markerContent )
225
+ errMessage := fmt .Sprintf ("Notebook execution failed as indicated by marker file '%s': %s" , markerPath , markerContent )
181
226
test .T ().Errorf ("%s" , errMessage )
182
227
notebookExecutionFinalError = fmt .Errorf ("%s" , errMessage )
183
228
// Return true to stop Eventually polling, because a *final* state has been reached
0 commit comments