@@ -17,6 +17,7 @@ limitations under the License.
17
17
package core
18
18
19
19
import (
20
+ "fmt"
20
21
"testing"
21
22
22
23
. "github.com/onsi/gomega"
@@ -31,14 +32,17 @@ import (
31
32
)
32
33
33
34
func TestPytorchjobWithSFTtrainerFinetuning (t * testing.T ) {
34
- runPytorchjobWithSFTtrainer (t , "config.json" )
35
+ runPytorchjobWithSFTtrainer (t , "config.json" , 0 )
35
36
}
36
37
37
38
func TestPytorchjobWithSFTtrainerLoRa (t * testing.T ) {
38
- runPytorchjobWithSFTtrainer (t , "config_lora.json" )
39
+ runPytorchjobWithSFTtrainer (t , "config_lora.json" , 0 )
40
+ }
41
+ func TestPytorchjobWithSFTtrainerQLoRa (t * testing.T ) {
42
+ runPytorchjobWithSFTtrainer (t , "config_qlora.json" , 1 )
39
43
}
40
44
41
- func runPytorchjobWithSFTtrainer (t * testing.T , modelConfigFile string ) {
45
+ func runPytorchjobWithSFTtrainer (t * testing.T , modelConfigFile string , numGpus int ) {
42
46
test := With (t )
43
47
44
48
// Create a namespace
@@ -58,7 +62,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
58
62
NamespaceSelector : & metav1.LabelSelector {},
59
63
ResourceGroups : []kueuev1beta1.ResourceGroup {
60
64
{
61
- CoveredResources : []corev1.ResourceName {corev1 .ResourceName ("cpu" ), corev1 .ResourceName ("memory" )},
65
+ CoveredResources : []corev1.ResourceName {corev1 .ResourceName ("cpu" ), corev1 .ResourceName ("memory" ), corev1 . ResourceName ( "nvidia.com/gpu" ) },
62
66
Flavors : []kueuev1beta1.FlavorQuotas {
63
67
{
64
68
Name : kueuev1beta1 .ResourceFlavorReference (resourceFlavor .Name ),
@@ -71,6 +75,10 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
71
75
Name : corev1 .ResourceMemory ,
72
76
NominalQuota : resource .MustParse ("12Gi" ),
73
77
},
78
+ {
79
+ Name : corev1 .ResourceName ("nvidia.com/gpu" ),
80
+ NominalQuota : resource .MustParse (fmt .Sprint (numGpus )),
81
+ },
74
82
},
75
83
},
76
84
},
@@ -82,7 +90,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
82
90
localQueue := CreateKueueLocalQueue (test , namespace .Name , clusterQueue .Name , AsDefaultQueue )
83
91
84
92
// Create training PyTorch job
85
- tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config )
93
+ tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config , numGpus )
86
94
87
95
// Make sure the Kueue Workload is admitted
88
96
test .Eventually (KueueWorkloads (test , namespace .Name ), TestTimeoutLong ).
@@ -146,14 +154,14 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
146
154
localQueue := CreateKueueLocalQueue (test , namespace .Name , clusterQueue .Name , AsDefaultQueue )
147
155
148
156
// Create first training PyTorch job
149
- tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config )
157
+ tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config , 0 )
150
158
151
159
// Make sure the PyTorch job is running
152
160
test .Eventually (PytorchJob (test , namespace .Name , tuningJob .Name ), TestTimeoutLong ).
153
161
Should (WithTransform (PytorchJobConditionRunning , Equal (corev1 .ConditionTrue )))
154
162
155
163
// Create second training PyTorch job
156
- secondTuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config )
164
+ secondTuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config , 0 )
157
165
158
166
// Make sure the second PyTorch job is suspended, waiting for first job to finish
159
167
test .Eventually (PytorchJob (test , namespace .Name , secondTuningJob .Name ), TestTimeoutShort ).
@@ -172,7 +180,7 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
172
180
test .T ().Logf ("PytorchJob %s/%s ran successfully" , secondTuningJob .Namespace , secondTuningJob .Name )
173
181
}
174
182
175
- func createPyTorchJob (test Test , namespace , localQueueName string , config corev1.ConfigMap ) * kftov1.PyTorchJob {
183
+ func createPyTorchJob (test Test , namespace , localQueueName string , config corev1.ConfigMap , numGpus int ) * kftov1.PyTorchJob {
176
184
tuningJob := & kftov1.PyTorchJob {
177
185
TypeMeta : metav1.TypeMeta {
178
186
APIVersion : corev1 .SchemeGroupVersion .String (),
@@ -191,6 +199,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
191
199
RestartPolicy : "OnFailure" ,
192
200
Template : corev1.PodTemplateSpec {
193
201
Spec : corev1.PodSpec {
202
+ Tolerations : []corev1.Toleration {
203
+ {
204
+ Key : "nvidia.com/gpu" ,
205
+ Operator : corev1 .TolerationOpExists ,
206
+ },
207
+ },
194
208
InitContainers : []corev1.Container {
195
209
{
196
210
Name : "copy-model" ,
@@ -235,10 +249,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
235
249
Requests : corev1.ResourceList {
236
250
corev1 .ResourceCPU : resource .MustParse ("2" ),
237
251
corev1 .ResourceMemory : resource .MustParse ("7Gi" ),
252
+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numGpus )),
238
253
},
239
254
Limits : corev1.ResourceList {
240
255
corev1 .ResourceCPU : resource .MustParse ("2" ),
241
256
corev1 .ResourceMemory : resource .MustParse ("7Gi" ),
257
+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numGpus )),
242
258
},
243
259
},
244
260
SecurityContext : & corev1.SecurityContext {
0 commit comments