Skip to content

Commit 5ca12be

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 5109100 + 291fc83 commit 5ca12be

File tree

8 files changed

+152
-40
lines changed

8 files changed

+152
-40
lines changed

OWNERS

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,25 @@
11
approvers:
2-
- anishasthana
32
- astefanutti
3+
- Bobbins228
44
- dimakis
5-
- jbusche
5+
- Fiona-Waters
66
- jiripetrlik
77
- kpostoffice
8-
- MichaelClifford
98
- sutaakar
10-
- tedhtchang
9+
- varshaprasad96
1110
reviewers:
11+
- abhijeet-dhumal
1212
- astefanutti
13+
- Bobbins228
14+
- ChughShilpa
1315
- dimakis
14-
- jbusche
16+
- Fiona-Waters
1517
- jiripetrlik
1618
- kpostoffice
17-
- MichaelClifford
1819
- sutaakar
19-
- tedhtchang
20+
- varshaprasad96
21+
emeritus_approvers:
22+
- anishasthana # 2024-09-04
23+
- jbusche # 2024-09-04
24+
- MichaelClifford # 2024-09-04
25+
- tedhtchang # 2024-09-04

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ toolchain go1.21.5
77
require (
88
github.com/kubeflow/training-operator v1.7.0
99
github.com/onsi/gomega v1.31.1
10-
github.com/project-codeflare/codeflare-common v0.0.0-20240827080155-9234d23ff47d
10+
github.com/project-codeflare/codeflare-common v0.0.0-20240905061421-272c8b361325
1111
github.com/prometheus/client_golang v1.18.0
1212
github.com/prometheus/common v0.45.0
1313
github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0

go.sum

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,10 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
363363
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
364364
github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
365365
github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
366-
github.com/project-codeflare/codeflare-common v0.0.0-20240827080155-9234d23ff47d h1:hbfF20rw/NHvXNXYLuxPjCnBS5Lotvt6rU0S9DLs0HU=
367-
github.com/project-codeflare/codeflare-common v0.0.0-20240827080155-9234d23ff47d/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
366+
github.com/project-codeflare/codeflare-common v0.0.0-20240829135229-4225111c401e h1:ozVW0icXLDJcKigc3Bi/4Fc4UMXMxESO53msCLameWw=
367+
github.com/project-codeflare/codeflare-common v0.0.0-20240829135229-4225111c401e/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
368+
github.com/project-codeflare/codeflare-common v0.0.0-20240905061421-272c8b361325 h1:77e1P1PFUizHP9V/DiIa19Mp/ITPH8z4d4zirAcnFY8=
369+
github.com/project-codeflare/codeflare-common v0.0.0-20240905061421-272c8b361325/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
368370
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
369371
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
370372
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=

tests/odh/mnist_ray_test.go

Lines changed: 71 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ package odh
1919
import (
2020
"bytes"
2121
"fmt"
22+
"log"
23+
"os/exec"
2224
"testing"
2325
"time"
2426

@@ -33,19 +35,50 @@ import (
3335
)
3436

3537
func TestMnistRayCpu(t *testing.T) {
36-
mnistRay(t, 0)
38+
mnistDefaultRayImage(t, 0)
3739
}
3840

3941
func TestMnistRayGpu(t *testing.T) {
40-
mnistRay(t, 1)
42+
mnistDefaultRayImage(t, 1)
4143
}
4244

43-
func mnistRay(t *testing.T, numGpus int) {
45+
func TestMnistCustomRayImageCpu(t *testing.T) {
46+
mnistCustomRayImage(t, 0)
47+
}
48+
49+
func TestMnistCustomRayImageGpu(t *testing.T) {
50+
mnistCustomRayImage(t, 1)
51+
}
52+
53+
func mnistDefaultRayImage(t *testing.T, numGpus int) {
4454
test := With(t)
4555

4656
// Create a namespace
4757
namespace := test.NewTestNamespace()
4858

59+
// Get ray image
60+
rayImage := GetRayImage()
61+
62+
mnistRay(test, numGpus, namespace, rayImage)
63+
}
64+
65+
func mnistCustomRayImage(t *testing.T, numGpus int) {
66+
test := With(t)
67+
68+
// Create a namespace
69+
namespace := test.NewTestNamespace()
70+
71+
// Build and Push custom ray image
72+
image := "ray-torch"
73+
buildAndPushRayImage(test, namespace.Name, image)
74+
75+
// Get custom ray image
76+
rayImage := getCustomRayImage(test, namespace.Name, image)
77+
78+
mnistRay(test, numGpus, namespace, rayImage)
79+
}
80+
81+
func mnistRay(test Test, numGpus int, namespace *corev1.Namespace, rayImage string) {
4982
// Create Kueue resources
5083
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
5184
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
@@ -103,7 +136,7 @@ func mnistRay(t *testing.T, numGpus int) {
103136
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
104137

105138
// Create Notebook CR
106-
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
139+
createNotebook(test, namespace, userToken, rayImage, config.Name, jupyterNotebookConfigMapFileName, numGpus)
107140

108141
// Gracefully cleanup Notebook
109142
defer func() {
@@ -141,10 +174,12 @@ func mnistRay(t *testing.T, numGpus int) {
141174
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to create new raycluster client: %s", err))
142175

143176
// wait until rayjob exists
144-
test.Eventually(func() []RayJobDetailsResponse {
177+
test.Eventually(func() ([]RayJobDetailsResponse, error) {
145178
rayJobs, err := rayClient.GetJobs()
146-
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to fetch ray-jobs : %s", err))
147-
return *rayJobs
179+
if err != nil {
180+
return *rayJobs, err
181+
}
182+
return *rayJobs, nil
148183
}, TestTimeoutMedium, 1*time.Second).Should(HaveLen(1), "Ray job not found")
149184

150185
// Get test job-id
@@ -154,21 +189,24 @@ func mnistRay(t *testing.T, numGpus int) {
154189
// Wait for the job to be succeeded or failed
155190
var rayJobStatus string
156191
test.T().Logf("Waiting for job to be Succeeded...\n")
157-
test.Eventually(func() string {
192+
test.Eventually(func() (string, error) {
158193
resp, err := rayClient.GetJobDetails(jobID)
159-
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to get job details :%s", err))
194+
if err != nil {
195+
return rayJobStatus, err
196+
}
160197
rayJobStatusVal := resp.Status
161198
if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
162199
test.T().Logf("JobStatus - %s\n", rayJobStatusVal)
163200
rayJobStatus = rayJobStatusVal
164-
return rayJobStatus
201+
return rayJobStatus, nil
165202
}
166203
if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
167204
test.T().Logf("JobStatus - %s...\n", rayJobStatusVal)
168205
rayJobStatus = rayJobStatusVal
169206
}
170-
return rayJobStatus
207+
return rayJobStatus, nil
171208
}, TestTimeoutDouble, 1*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
209+
172210
// Store job logs in output directory
173211
WriteRayJobAPILogs(test, rayClient, jobID)
174212

@@ -216,3 +254,25 @@ func readMnistScriptTemplate(test Test, filePath string) []byte {
216254

217255
return ParseTemplate(test, template, props)
218256
}
257+
258+
func buildAndPushRayImage(test Test, namespace string, image string) {
259+
cmd := exec.Command("resources/custom_image.sh", namespace, image, "-c", "echo stdout; echo 1>&2 stderr")
260+
261+
stdoutStderr, err := cmd.CombinedOutput()
262+
if err != nil {
263+
log.Fatal("Error executing custom_image script :", err)
264+
}
265+
test.Expect(err).NotTo(HaveOccurred())
266+
267+
fmt.Printf("Logs of build and custom ray image . . .\n %s", stdoutStderr)
268+
}
269+
270+
func getCustomRayImage(test Test, namespace string, image string) string {
271+
tag := "latest"
272+
name := image + ":" + tag
273+
274+
imageStreamTag := GetImageStreamTag(test, namespace, name)
275+
imageReference := imageStreamTag.Image.DockerImageReference
276+
277+
return imageReference
278+
}

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,11 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
103103
// Create role binding with Namespace specific admin cluster role
104104
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
105105

106+
// Get ray image
107+
rayImage := GetRayImage()
108+
106109
// Create Notebook CR
107-
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
110+
createNotebook(test, namespace, userToken, rayImage, config.Name, jupyterNotebookConfigMapFileName, numGpus)
108111

109112
// Gracefully cleanup Notebook
110113
defer func() {
@@ -141,11 +144,13 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
141144
rayClient, err := NewRayClusterClient(rayClusterClientConfig, test.Config().BearerToken)
142145
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to create new raycluster client: %s", err))
143146

144-
// Wait until the rayjob is created and running
145-
test.Eventually(func() []RayJobDetailsResponse {
147+
// wait until rayjob exists
148+
test.Eventually(func() ([]RayJobDetailsResponse, error) {
146149
rayJobs, err := rayClient.GetJobs()
147-
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to fetch ray-jobs : %s", err))
148-
return *rayJobs
150+
if err != nil {
151+
return *rayJobs, err
152+
}
153+
return *rayJobs, nil
149154
}, TestTimeoutMedium, 1*time.Second).Should(HaveLen(1), "Ray job not found")
150155

151156
// Get rayjob-ID
@@ -155,21 +160,24 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
155160
// Wait for the job to either succeed or fail
156161
var rayJobStatus string
157162
test.T().Logf("Waiting for job to be Succeeded...\n")
158-
test.Eventually(func() string {
163+
test.Eventually(func() (string, error) {
159164
resp, err := rayClient.GetJobDetails(jobID)
160-
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to get job details :%s", err))
165+
if err != nil {
166+
return rayJobStatus, err
167+
}
161168
rayJobStatusVal := resp.Status
162169
if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
163170
test.T().Logf("JobStatus - %s\n", rayJobStatusVal)
164171
rayJobStatus = rayJobStatusVal
165-
return rayJobStatus
172+
return rayJobStatus, nil
166173
}
167174
if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
168175
test.T().Logf("JobStatus - %s...\n", rayJobStatusVal)
169176
rayJobStatus = rayJobStatusVal
170177
}
171-
return rayJobStatus
178+
return rayJobStatus, nil
172179
}, TestTimeoutDouble, 1*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
180+
173181
// Store job logs in output directory
174182
WriteRayJobAPILogs(test, rayClient, jobID)
175183

tests/odh/notebook.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ type NotebookProps struct {
5151
S3DefaultRegion string
5252
}
5353

54-
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
54+
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, rayImage string, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
5555
// Create PVC for Notebook
5656
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", corev1.ReadWriteOnce)
5757
s3BucketName, s3BucketNameExists := GetStorageBucketName()
@@ -73,7 +73,7 @@ func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, j
7373
KubernetesUserBearerToken: notebookUserToken,
7474
Namespace: namespace.Name,
7575
OpenDataHubNamespace: GetOpenDataHubNamespace(test),
76-
RayImage: GetRayImage(),
76+
RayImage: rayImage,
7777
NotebookImage: GetNotebookImage(test),
7878
NotebookConfigMapName: jupyterNotebookConfigMapName,
7979
NotebookConfigMapFileName: jupyterNotebookConfigMapFileName,

tests/odh/ray_finetune_llm_deepspeed_test.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,11 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int, modelName string, modelC
9999

100100
config := CreateConfigMap(test, namespace.Name, configMap)
101101

102+
// Get ray image
103+
rayImage := GetRayImage()
104+
102105
// Create Notebook CR
103-
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
106+
createNotebook(test, namespace, userToken, rayImage, config.Name, jupyterNotebookConfigMapFileName, numGpus)
104107

105108
// Gracefully cleanup Notebook
106109
defer func() {
@@ -129,10 +132,12 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int, modelName string, modelC
129132
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to create new raycluster client: %s", err))
130133

131134
// wait until rayjob exists
132-
test.Eventually(func() []RayJobDetailsResponse {
135+
test.Eventually(func() ([]RayJobDetailsResponse, error) {
133136
rayJobs, err := rayClient.GetJobs()
134-
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to fetch ray-jobs : %s", err))
135-
return *rayJobs
137+
if err != nil {
138+
return *rayJobs, err
139+
}
140+
return *rayJobs, nil
136141
}, TestTimeoutMedium, 1*time.Second).Should(HaveLen(1), "Ray job not found")
137142

138143
// Get test job-id
@@ -142,21 +147,24 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int, modelName string, modelC
142147
// Wait for the job to be succeeded or failed
143148
var rayJobStatus string
144149
test.T().Logf("Waiting for job to be Succeeded...\n")
145-
test.Eventually(func() string {
150+
test.Eventually(func() (string, error) {
146151
resp, err := rayClient.GetJobDetails(jobID)
147-
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to get job details :%s", err))
152+
if err != nil {
153+
return rayJobStatus, err
154+
}
148155
rayJobStatusVal := resp.Status
149156
if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
150157
test.T().Logf("JobStatus - %s\n", rayJobStatusVal)
151158
rayJobStatus = rayJobStatusVal
152-
return rayJobStatus
159+
return rayJobStatus, nil
153160
}
154161
if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
155162
test.T().Logf("JobStatus - %s...\n", rayJobStatusVal)
156163
rayJobStatus = rayJobStatusVal
157164
}
158-
return rayJobStatus
165+
return rayJobStatus, nil
159166
}, TestTimeoutDouble, 1*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
167+
160168
// Store job logs in output directory
161169
WriteRayJobAPILogs(test, rayClient, jobID)
162170

tests/odh/resources/custom_image.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/bin/bash
#
# Build the example container image named by <image> with podman and push it
# to the OpenShift integrated container registry under project <namespace>.
#
# Usage: custom_image.sh <namespace> <image>
#
# Requires `oc` (already logged in to the cluster) and `podman` on PATH.
# The image sources are looked up relative to the caller's working directory
# at ./../../images/runtime/examples/<image>.

# Stop at the first failing command, on any unset variable, and on a failure
# anywhere in a pipeline, so a broken step is not masked by the ones after it.
set -euo pipefail

namespace=${1:?usage: custom_image.sh <namespace> <image>}
image=${2:?usage: custom_image.sh <namespace> <image>}

echo "switch to current project . . ."
oc project "$namespace"

echo "Build custom container image using podman . . ."
cd "./../../images/runtime/examples/$image"
# The build context is the current directory (podman's default when omitted).
podman build -t "$image" -f Dockerfile .

echo "Expose the integrated container registry . . ."
oc patch configs.imageregistry.operator.openshift.io/cluster --patch '{"spec":{"defaultRoute":true}}' --type=merge

echo "Wait for the route to the container registry to be admitted . . ."
oc wait -n openshift-image-registry route/default-route --for=jsonpath='{.status.ingress[0].conditions[0].status}'=True

echo "Login to the container registry . . ."
# Resolve the registry host once, after the route has been admitted.
registry=$(oc registry info)
# Feed the token through stdin so it never appears in the process list.
oc whoami -t | podman login -u "$(oc whoami)" --password-stdin "$registry"

echo "Push the image to the integrated container registry . . ."
podman tag "$image" "$registry/$namespace/$image"
podman push "$registry/$namespace/$image"

echo "Custom Ray Image is . . . "
oc get is "$image" -o jsonpath='{.status.tags[?(@.tag=="latest")].items[0].dockerImageReference}'

0 commit comments

Comments
 (0)