@@ -17,6 +17,7 @@ limitations under the License.
17
17
package core
18
18
19
19
import (
20
+ "fmt"
20
21
"testing"
21
22
22
23
. "github.com/onsi/gomega"
@@ -31,17 +32,17 @@ import (
31
32
)
32
33
33
34
// TestPytorchjobWithSFTtrainerFinetuning runs the shared
// runPytorchjobWithSFTtrainer scenario with the "config.json" model config.
// The changed call below adds a third argument (numGpus) and passes 0 for it.
func TestPytorchjobWithSFTtrainerFinetuning (t * testing.T ) {
34
- runPytorchjobWithSFTtrainer (t , "config.json" )
35
+ runPytorchjobWithSFTtrainer (t , "config.json" , 0 )
35
36
}
36
37
37
38
// TestPytorchjobWithSFTtrainerLoRa runs the shared
// runPytorchjobWithSFTtrainer scenario with the "config_lora.json" model
// config. The changed call below adds a third argument (numGpus) and passes 0
// for it.
func TestPytorchjobWithSFTtrainerLoRa (t * testing.T ) {
38
- runPytorchjobWithSFTtrainer (t , "config_lora.json" )
39
+ runPytorchjobWithSFTtrainer (t , "config_lora.json" , 0 )
39
40
}
40
41
// TestPytorchjobWithSFTtrainerQLoRa runs the shared
// runPytorchjobWithSFTtrainer scenario with the "config_qlora.json" model
// config. Unlike the other two SFT-trainer tests, the changed call below
// passes 1 as the new third argument (numGpus).
func TestPytorchjobWithSFTtrainerQLoRa (t * testing.T ) {
41
- runPytorchjobWithSFTtrainer (t , "config_qlora.json" )
42
+ runPytorchjobWithSFTtrainer (t , "config_qlora.json" , 1 )
42
43
}
43
44
44
- func runPytorchjobWithSFTtrainer (t * testing.T , modelConfigFile string ) {
45
+ func runPytorchjobWithSFTtrainer (t * testing.T , modelConfigFile string , numGpus int ) {
45
46
test := With (t )
46
47
47
48
// Create a namespace
@@ -61,7 +62,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
61
62
NamespaceSelector : & metav1.LabelSelector {},
62
63
ResourceGroups : []kueuev1beta1.ResourceGroup {
63
64
{
64
- CoveredResources : []corev1.ResourceName {corev1 .ResourceName ("cpu" ), corev1 .ResourceName ("memory" )},
65
+ CoveredResources : []corev1.ResourceName {corev1 .ResourceName ("cpu" ), corev1 .ResourceName ("memory" ), corev1 . ResourceName ( "nvidia.com/gpu" ) },
65
66
Flavors : []kueuev1beta1.FlavorQuotas {
66
67
{
67
68
Name : kueuev1beta1 .ResourceFlavorReference (resourceFlavor .Name ),
@@ -74,6 +75,10 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
74
75
Name : corev1 .ResourceMemory ,
75
76
NominalQuota : resource .MustParse ("12Gi" ),
76
77
},
78
+ {
79
+ Name : corev1 .ResourceName ("nvidia.com/gpu" ),
80
+ NominalQuota : resource .MustParse (fmt .Sprint (numGpus )),
81
+ },
77
82
},
78
83
},
79
84
},
@@ -85,7 +90,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
85
90
localQueue := CreateKueueLocalQueue (test , namespace .Name , clusterQueue .Name , AsDefaultQueue )
86
91
87
92
// Create training PyTorch job
88
- tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config )
93
+ tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config , numGpus )
89
94
90
95
// Make sure the Kueue Workload is admitted
91
96
test .Eventually (KueueWorkloads (test , namespace .Name ), TestTimeoutLong ).
@@ -149,14 +154,14 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
149
154
localQueue := CreateKueueLocalQueue (test , namespace .Name , clusterQueue .Name , AsDefaultQueue )
150
155
151
156
// Create first training PyTorch job
152
- tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config )
157
+ tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config , 0 )
153
158
154
159
// Make sure the PyTorch job is running
155
160
test .Eventually (PytorchJob (test , namespace .Name , tuningJob .Name ), TestTimeoutLong ).
156
161
Should (WithTransform (PytorchJobConditionRunning , Equal (corev1 .ConditionTrue )))
157
162
158
163
// Create second training PyTorch job
159
- secondTuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config )
164
+ secondTuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config , 0 )
160
165
161
166
// Make sure the second PyTorch job is suspended, waiting for first job to finish
162
167
test .Eventually (PytorchJob (test , namespace .Name , secondTuningJob .Name ), TestTimeoutShort ).
@@ -175,7 +180,7 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
175
180
test .T ().Logf ("PytorchJob %s/%s ran successfully" , secondTuningJob .Namespace , secondTuningJob .Name )
176
181
}
177
182
178
- func createPyTorchJob (test Test , namespace , localQueueName string , config corev1.ConfigMap ) * kftov1.PyTorchJob {
183
+ func createPyTorchJob (test Test , namespace , localQueueName string , config corev1.ConfigMap , numGpus int ) * kftov1.PyTorchJob {
179
184
tuningJob := & kftov1.PyTorchJob {
180
185
TypeMeta : metav1.TypeMeta {
181
186
APIVersion : corev1 .SchemeGroupVersion .String (),
@@ -194,6 +199,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
194
199
RestartPolicy : "OnFailure" ,
195
200
Template : corev1.PodTemplateSpec {
196
201
Spec : corev1.PodSpec {
202
+ Tolerations : []corev1.Toleration {
203
+ {
204
+ Key : "nvidia.com/gpu" ,
205
+ Operator : corev1 .TolerationOpExists ,
206
+ },
207
+ },
197
208
InitContainers : []corev1.Container {
198
209
{
199
210
Name : "copy-model" ,
@@ -238,10 +249,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
238
249
Requests : corev1.ResourceList {
239
250
corev1 .ResourceCPU : resource .MustParse ("2" ),
240
251
corev1 .ResourceMemory : resource .MustParse ("7Gi" ),
252
+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numGpus )),
241
253
},
242
254
Limits : corev1.ResourceList {
243
255
corev1 .ResourceCPU : resource .MustParse ("2" ),
244
256
corev1 .ResourceMemory : resource .MustParse ("7Gi" ),
257
+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numGpus )),
245
258
},
246
259
},
247
260
SecurityContext : & corev1.SecurityContext {
0 commit comments