@@ -18,6 +18,7 @@ package e2e
1818
1919import (
2020 "crypto/tls"
21+ "fmt"
2122 "net/http"
2223 "net/url"
2324 "testing"
@@ -36,9 +37,17 @@ import (
3637
3738// Trains a model on the MNIST dataset as a RayJob, executed by a Ray cluster
3839// directly managed by Kueue, and asserts successful completion of the training job.
39- func TestMNISTRayJobRayCluster (t * testing.T ) {
40+
41+ func TestMnistRayJobRayClusterCpu (t * testing.T ) {
42+ runMnistRayJobRayCluster (t , "cpu" , 0 )
43+ }
44+
45+ func TestMnistRayJobRayClusterGpu (t * testing.T ) {
46+ runMnistRayJobRayCluster (t , "gpu" , 1 )
47+ }
48+
49+ func runMnistRayJobRayCluster (t * testing.T , accelerator string , numberOfGpus int ) {
4050 test := With (t )
41- test .T ().Parallel ()
4251
4352 // Create a namespace and localqueue in that namespace
4453 namespace := test .NewTestNamespace ()
@@ -51,7 +60,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
5160 test .T ().Logf ("Created ConfigMap %s/%s successfully" , mnist .Namespace , mnist .Name )
5261
5362 // Create RayCluster and assign it to the localqueue
54- rayCluster := constructRayCluster (test , namespace , mnist )
63+ rayCluster := constructRayCluster (test , namespace , mnist , numberOfGpus )
5564 AssignToLocalQueue (rayCluster , localQueue )
5665 rayCluster , err = test .Client ().Ray ().RayV1 ().RayClusters (namespace .Name ).Create (test .Ctx (), rayCluster , metav1.CreateOptions {})
5766 test .Expect (err ).NotTo (HaveOccurred ())
@@ -62,7 +71,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
6271 Should (WithTransform (RayClusterState , Equal (rayv1 .Ready )))
6372
6473 // Create RayJob
65- rayJob := constructRayJob (test , namespace , rayCluster )
74+ rayJob := constructRayJob (test , namespace , rayCluster , accelerator , numberOfGpus )
6675 rayJob , err = test .Client ().Ray ().RayV1 ().RayJobs (namespace .Name ).Create (test .Ctx (), rayJob , metav1.CreateOptions {})
6776 test .Expect (err ).NotTo (HaveOccurred ())
6877 test .T ().Logf ("Created RayJob %s/%s successfully" , rayJob .Namespace , rayJob .Name )
@@ -88,10 +97,17 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
8897 To (WithTransform (RayJobStatus , Equal (rayv1 .JobStatusSucceeded )))
8998}
9099
100+ func TestMnistRayJobRayClusterAppWrapperCpu (t * testing.T ) {
101+ runMnistRayJobRayClusterAppWrapper (t , "cpu" , 0 )
102+ }
103+
104+ func TestMnistRayJobRayClusterAppWrapperGpu (t * testing.T ) {
105+ runMnistRayJobRayClusterAppWrapper (t , "gpu" , 1 )
106+ }
107+
91108// Same as runMnistRayJobRayCluster, except the RayCluster is wrapped in an AppWrapper
92- func TestMNISTRayJobRayClusterAppWrapper (t * testing.T ) {
109+ func runMnistRayJobRayClusterAppWrapper (t * testing.T , accelerator string , numberOfGpus int ) {
93110 test := With (t )
94- test .T ().Parallel ()
95111
96112 // Create a namespace and localqueue in that namespace
97113 namespace := test .NewTestNamespace ()
@@ -104,7 +120,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
104120 test .T ().Logf ("Created ConfigMap %s/%s successfully" , mnist .Namespace , mnist .Name )
105121
106122 // Create RayCluster, wrap in AppWrapper and assign to localqueue
107- rayCluster := constructRayCluster (test , namespace , mnist )
123+ rayCluster := constructRayCluster (test , namespace , mnist , numberOfGpus )
108124 aw := & mcadv1beta2.AppWrapper {
109125 TypeMeta : metav1.TypeMeta {
110126 APIVersion : mcadv1beta2 .GroupVersion .String (),
@@ -140,7 +156,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
140156 Should (WithTransform (RayClusterState , Equal (rayv1 .Ready )))
141157
142158 // Create RayJob
143- rayJob := constructRayJob (test , namespace , rayCluster )
159+ rayJob := constructRayJob (test , namespace , rayCluster , accelerator , numberOfGpus )
144160 rayJob , err = test .Client ().Ray ().RayV1 ().RayJobs (namespace .Name ).Create (test .Ctx (), rayJob , metav1.CreateOptions {})
145161 test .Expect (err ).NotTo (HaveOccurred ())
146162 test .T ().Logf ("Created RayJob %s/%s successfully" , rayJob .Namespace , rayJob .Name )
@@ -183,7 +199,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con
183199 }
184200}
185201
186- func constructRayCluster (_ Test , namespace * corev1.Namespace , mnist * corev1.ConfigMap ) * rayv1.RayCluster {
202+ func constructRayCluster (_ Test , namespace * corev1.Namespace , mnist * corev1.ConfigMap , numberOfGpus int ) * rayv1.RayCluster {
187203 return & rayv1.RayCluster {
188204 TypeMeta : metav1.TypeMeta {
189205 APIVersion : rayv1 .GroupVersion .String (),
@@ -236,24 +252,6 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
236252 corev1 .ResourceMemory : resource .MustParse ("2G" ),
237253 },
238254 },
239- VolumeMounts : []corev1.VolumeMount {
240- {
241- Name : "mnist" ,
242- MountPath : "/home/ray/jobs" ,
243- },
244- },
245- },
246- },
247- Volumes : []corev1.Volume {
248- {
249- Name : "mnist" ,
250- VolumeSource : corev1.VolumeSource {
251- ConfigMap : & corev1.ConfigMapVolumeSource {
252- LocalObjectReference : corev1.LocalObjectReference {
253- Name : mnist .Name ,
254- },
255- },
256- },
257255 },
258256 },
259257 },
@@ -282,11 +280,31 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
282280 Resources : corev1.ResourceRequirements {
283281 Requests : corev1.ResourceList {
284282 corev1 .ResourceCPU : resource .MustParse ("250m" ),
285- corev1 .ResourceMemory : resource .MustParse ("256Mi" ),
283+ corev1 .ResourceMemory : resource .MustParse ("1G" ),
284+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numberOfGpus )),
286285 },
287286 Limits : corev1.ResourceList {
288- corev1 .ResourceCPU : resource .MustParse ("1" ),
289- corev1 .ResourceMemory : resource .MustParse ("2G" ),
287+ corev1 .ResourceCPU : resource .MustParse ("2" ),
288+ corev1 .ResourceMemory : resource .MustParse ("4G" ),
289+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numberOfGpus )),
290+ },
291+ },
292+ VolumeMounts : []corev1.VolumeMount {
293+ {
294+ Name : "mnist" ,
295+ MountPath : "/home/ray/jobs" ,
296+ },
297+ },
298+ },
299+ },
300+ Volumes : []corev1.Volume {
301+ {
302+ Name : "mnist" ,
303+ VolumeSource : corev1.VolumeSource {
304+ ConfigMap : & corev1.ConfigMapVolumeSource {
305+ LocalObjectReference : corev1.LocalObjectReference {
306+ Name : mnist .Name ,
307+ },
290308 },
291309 },
292310 },
@@ -299,7 +317,7 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
299317 }
300318}
301319
302- func constructRayJob (_ Test , namespace * corev1.Namespace , rayCluster * rayv1.RayCluster ) * rayv1.RayJob {
320+ func constructRayJob (_ Test , namespace * corev1.Namespace , rayCluster * rayv1.RayCluster , accelerator string , numberOfGpus int ) * rayv1.RayJob {
303321 return & rayv1.RayJob {
304322 TypeMeta : metav1.TypeMeta {
305323 APIVersion : rayv1 .GroupVersion .String (),
@@ -320,6 +338,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
320338 MNIST_DATASET_URL: "` + GetMnistDatasetURL () + `"
321339 PIP_INDEX_URL: "` + GetPipIndexURL () + `"
322340 PIP_TRUSTED_HOST: "` + GetPipTrustedHost () + `"
341+ ACCELERATOR: "` + accelerator + `"
323342` ,
324343 ClusterSelector : map [string ]string {
325344 RayJobDefaultClusterSelectorKey : rayCluster .Name ,
@@ -336,6 +355,9 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
336355 },
337356 },
338357 },
358+ EntrypointNumCpus : 2 ,
359+ // Using EntrypointNumGpus doesn't seem to work properly on a KinD cluster with a GPU; EntrypointNumCpus seems reliable
360+ EntrypointNumGpus : float32 (numberOfGpus ),
339361 },
340362 }
341363}
0 commit comments