Skip to content

Commit 3ca59ee

Browse files
committed
Merge remote-tracking branch 'origin/main'
2 parents 4d90bb8 + 59fac38 commit 3ca59ee

11 files changed

+159
-40
lines changed

images/tests/.env-odh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
FMS_HF_TUNING_IMAGE=quay.io/modh/fms-hf-tuning:release
2+
NOTEBOOK_IMAGE=quay.io/modh/odh-workbench-jupyter-datascience-cpu-py311-ubi9:rhoai-2.22

images/tests/Dockerfile

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,9 @@ WORKDIR /distributed-workloads/tests
2424
# Copy the source from the current directory to the working directory inside the container
2525
COPY tests/ .
2626

27+
# Copy the .env-odh file and the run-test.sh script to the working directory inside the container
28+
COPY images/tests/.env-odh .env-odh
29+
COPY images/tests/run-test.sh .
30+
2731
# Command to run the tests
28-
ENTRYPOINT [ "gotestsum"]
29-
30-
# Configure images using environment variables
31-
ENV FMS_HF_TUNING_IMAGE=<replace_me>
32-
ENV TEST_TRAINING_CUDA_PYTORCH_241_IMAGE=<replace_me>
33-
ENV TEST_TRAINING_ROCM_PYTORCH_241_IMAGE=<replace_me>
34-
ENV TEST_TRAINING_CUDA_PYTORCH_251_IMAGE=<replace_me>
35-
ENV TEST_TRAINING_ROCM_PYTORCH_251_IMAGE=<replace_me>
36-
ENV NOTEBOOK_IMAGE=<replace_me>
32+
ENTRYPOINT ["bash", "run-test.sh"]

images/tests/run-test.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
#
# Entrypoint script for the tests image.
# Loads the variables defined in .env-odh into the environment and then runs
# gotestsum with whatever arguments were passed to this script.

# While allexport is on, every variable assigned by the sourced file is
# exported, so the test process started below inherits it.
set -o allexport
if ! source .env-odh; then
    # Without this check the script would carry on and start the tests
    # with the variables from .env-odh missing.
    echo "run-test.sh: unable to load .env-odh from $(pwd)" >&2
    exit 1
fi
set +o allexport

# Replace the shell with gotestsum instead of running it as a child, so that
# signals sent to this process reach gotestsum directly and its exit status
# is reported unchanged.
exec gotestsum "$@"

tests/common/notebook.go

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,35 @@ func readFile(t Test, fileName string) []byte {
4747
return file
4848
}
4949

// Predefined resource presets for the notebook container. CreateNotebook
// copies one of them according to the requested ContainerSize.
var (
	// SmallContainerResources is the preset selected for ContainerSizeSmall.
	SmallContainerResources = ContainerResources{
		Limits:   ResourceConfig{CPU: "2", Memory: "3Gi"},
		Requests: ResourceConfig{CPU: "1", Memory: "3Gi"},
	}
	// MediumContainerResources is the preset selected for ContainerSizeMedium.
	MediumContainerResources = ContainerResources{
		Limits:   ResourceConfig{CPU: "6", Memory: "24Gi"},
		Requests: ResourceConfig{CPU: "3", Memory: "24Gi"},
	}
)

// ResourceConfig holds one side (limits or requests) of a container's
// resource settings. The values are kept as strings because they are
// substituted verbatim into the notebook template.
type ResourceConfig struct {
	// CPU is the CPU quantity, e.g. "2".
	CPU string
	// Memory is the memory quantity, e.g. "3Gi".
	Memory string
	// GPUResourceLabel is the accelerator resource name, e.g.
	// "nvidia.com/gpu" or "amd.com/gpu". An empty string means no GPU
	// entry is rendered into the template.
	GPUResourceLabel string
}

// ContainerResources pairs the limits and the requests applied to the
// notebook container.
type ContainerResources struct {
	Limits   ResourceConfig
	Requests ResourceConfig
}

// ContainerSize names a resource preset for the notebook container.
type ContainerSize string

// Supported ContainerSize values.
const (
	// ContainerSizeSmall selects SmallContainerResources.
	ContainerSizeSmall ContainerSize = "small"
	// ContainerSizeMedium selects MediumContainerResources.
	ContainerSizeMedium ContainerSize = "medium"
)
78+
5079
var notebookResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1", Resource: "notebooks"}
5180

5281
type NotebookProps struct {
@@ -68,9 +97,11 @@ type NotebookProps struct {
6897
S3SecretAccessKey string
6998
S3Endpoint string
7099
S3DefaultRegion string
100+
NotebookResources ContainerResources
101+
SizeSelection ContainerSize
71102
}
72103

73-
func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken string, command []string, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int, notebookPVC *corev1.PersistentVolumeClaim) {
104+
func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken string, command []string, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int, notebookPVC *corev1.PersistentVolumeClaim, containerSize ContainerSize, acceleratorResourceLabel ...string) {
74105
s3BucketName, s3BucketNameExists := GetStorageBucketName()
75106
s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
76107
s3SecretAccessKey, _ := GetStorageBucketSecretKey()
@@ -86,6 +117,36 @@ func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken st
86117
s3DefaultRegion = "''"
87118
}
88119

120+
var selectedContainerResources ContainerResources
121+
var gpuResourceLabel string
122+
if len(acceleratorResourceLabel) == 1 {
123+
gpuResourceLabel = acceleratorResourceLabel[0]
124+
} else {
125+
gpuResourceLabel = ""
126+
}
127+
128+
if containerSize == ContainerSizeSmall {
129+
selectedContainerResources = SmallContainerResources
130+
// For small, ensure no GPU resource is requested
131+
selectedContainerResources.Limits.GPUResourceLabel = ""
132+
selectedContainerResources.Requests.GPUResourceLabel = ""
133+
} else if containerSize == ContainerSizeMedium {
134+
selectedContainerResources = MediumContainerResources
135+
136+
if gpuResourceLabel != "" && gpuResourceLabel != NVIDIA.ResourceLabel && gpuResourceLabel != AMD.ResourceLabel {
137+
test.T().Errorf("Unsupported GPU resource label for medium size: %s. Must be '%s', '%s', or an empty string.", gpuResourceLabel, NVIDIA.ResourceLabel, AMD.ResourceLabel)
138+
gpuResourceLabel = "" // Fallback to no GPU if label is invalid
139+
}
140+
141+
// Apply the determined GPUResourceLabel
142+
selectedContainerResources.Limits.GPUResourceLabel = gpuResourceLabel
143+
selectedContainerResources.Requests.GPUResourceLabel = gpuResourceLabel
144+
} else {
145+
test.T().Errorf("Unsupported container size: %s. Must be '%s' or '%s'. Hence using '%s' container size.",
146+
containerSize, ContainerSizeSmall, ContainerSizeMedium, ContainerSizeSmall)
147+
selectedContainerResources = SmallContainerResources // Fallback to Small container size
148+
}
149+
89150
// Read the Notebook CR from resources and perform replacements for custom values using go template
90151
notebookProps := NotebookProps{
91152
IngressDomain: GetOpenShiftIngressDomain(test),
@@ -106,6 +167,8 @@ func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken st
106167
S3DefaultRegion: s3DefaultRegion,
107168
PipIndexUrl: GetPipIndexURL(),
108169
PipTrustedHost: GetPipTrustedHost(),
170+
NotebookResources: selectedContainerResources,
171+
SizeSelection: containerSize,
109172
}
110173
notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml")
111174
test.Expect(err).NotTo(gomega.HaveOccurred())

tests/common/resources/custom-nb-small.yaml

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ metadata:
1111
annotations:
1212
notebooks.opendatahub.io/inject-oauth: "true"
1313
notebooks.opendatahub.io/last-image-selection: codeflare-notebook:latest
14-
notebooks.opendatahub.io/last-size-selection: Small
14+
notebooks.opendatahub.io/last-size-selection: {{.SizeSelection}}
1515
notebooks.opendatahub.io/oauth-logout-url: https://odh-dashboard-{{.OpenDataHubNamespace}}.{{.IngressDomain}}/notebookController/kube-3aadmin/home
1616
opendatahub.io/link: https://jupyter-nb-kube-3aadmin-{{.Namespace}}.{{.IngressDomain}}/notebook/{{.Namespace}}/jupyter-nb-kube-3aadmin
1717
opendatahub.io/username: kube:admin
@@ -75,11 +75,17 @@ spec:
7575
protocol: TCP
7676
resources:
7777
limits:
78-
cpu: "2"
79-
memory: 3Gi
78+
cpu: "{{ .NotebookResources.Limits.CPU }}"
79+
memory: "{{ .NotebookResources.Limits.Memory }}"
80+
{{ if .NotebookResources.Limits.GPUResourceLabel }} # Check if GPUResourceLabel is NOT empty
81+
{{ .NotebookResources.Limits.GPUResourceLabel }}: 1
82+
{{ end }}
8083
requests:
81-
cpu: "1"
82-
memory: 3Gi
84+
cpu: "{{ .NotebookResources.Requests.CPU }}"
85+
memory: "{{ .NotebookResources.Requests.Memory }}"
86+
{{ if .NotebookResources.Requests.GPUResourceLabel }} # Check if GPUResourceLabel is NOT empty
87+
{{ .NotebookResources.Requests.GPUResourceLabel }}: 1
88+
{{ end }}
8389
volumeMounts:
8490
- mountPath: /opt/app-root/src
8591
name: jupyterhub-nb-kube-3aadmin-pvc

tests/kfto/kfto_mnist_sdk_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ func runMnistSDK(t *testing.T, trainingImage string) {
114114
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", AccessModes(corev1.ReadWriteOnce))
115115

116116
// Create Notebook CR
117-
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, 0, notebookPVC)
117+
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, 0, notebookPVC, ContainerSizeSmall)
118118

119119
// Gracefully cleanup Notebook
120120
defer func() {

tests/kfto/kfto_sft_llm_test.go

Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,26 @@ import (
2929
. "github.com/onsi/gomega"
3030

3131
corev1 "k8s.io/api/core/v1"
32+
"k8s.io/apimachinery/pkg/api/resource"
3233
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34+
"sigs.k8s.io/kueue/apis/kueue/v1beta1"
3335

3436
. "github.com/opendatahub-io/distributed-workloads/tests/common"
3537
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
3638
"github.com/opendatahub-io/distributed-workloads/tests/odh"
3739
)
3840

39-
func TestKftoSftLlmLlama3_1_8BInstruct(t *testing.T) {
41+
func TestKftoSftLlmLlama3_1_8BInstructWithCudaPyTorch251(t *testing.T) {
4042
Tags(t, KftoCuda)
41-
kftoSftLlm(t, "meta-llama/Llama-3.1-8B-Instruct")
43+
kftoSftLlm(t, GetTrainingCudaPyTorch251Image(), NVIDIA, "meta-llama/Llama-3.1-8B-Instruct")
4244
}
4345

44-
func kftoSftLlm(t *testing.T, modelName string) {
46+
func TestKftoSftLlmLlama3_1_8BInstructWithROCmPyTorch251(t *testing.T) {
47+
Tags(t, KftoRocm)
48+
kftoSftLlm(t, GetTrainingROCmPyTorch251Image(), AMD, "meta-llama/Llama-3.1-8B-Instruct")
49+
}
50+
51+
func kftoSftLlm(t *testing.T, image string, gpu Accelerator, modelName string) {
4552
test := With(t)
4653

4754
// Create a namespace
@@ -66,28 +73,66 @@ func kftoSftLlm(t *testing.T, modelName string) {
6673
// Create PVC for Notebook
6774
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "500Gi", AccessModes(corev1.ReadWriteMany), StorageClassName(storageClass.Name))
6875

76+
// Create Kueue resources
77+
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
78+
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
79+
cqSpec := v1beta1.ClusterQueueSpec{
80+
NamespaceSelector: &metav1.LabelSelector{},
81+
ResourceGroups: []v1beta1.ResourceGroup{
82+
{
83+
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName(gpu.ResourceLabel)},
84+
Flavors: []v1beta1.FlavorQuotas{
85+
{
86+
Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
87+
Resources: []v1beta1.ResourceQuota{
88+
{
89+
Name: corev1.ResourceCPU,
90+
NominalQuota: resource.MustParse("32"),
91+
},
92+
{
93+
Name: corev1.ResourceMemory,
94+
NominalQuota: resource.MustParse("512Gi"),
95+
},
96+
{
97+
Name: corev1.ResourceName(gpu.ResourceLabel),
98+
NominalQuota: resource.MustParse("8"),
99+
},
100+
},
101+
},
102+
},
103+
},
104+
},
105+
}
106+
107+
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
108+
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
109+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
110+
69111
// Read and update the notebook content
70112
notebookContent := odh.ReadFileExt(test, workingDirectory+"/../../examples/kfto-sft-llm/sft.ipynb")
71113
updatedNotebookContent := string(notebookContent)
72114

73115
// Update notebook parameters for testing
74116
requiredChangesInNotebook := map[string]string{
75117
"model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct": fmt.Sprintf("model_name_or_path: %s", modelName),
76-
"num_train_epochs: 10": "num_train_epochs: 1",
77-
"output_dir: /mnt/shared/Meta-Llama-3.1-8B-Instruct": fmt.Sprintf("output_dir: /mnt/shared/%s", modelName),
78-
"api_server = \\\"<API_SERVER>\\\"": fmt.Sprintf("api_server = \\\"%s\\\"", GetOpenShiftApiUrl(test)),
79-
"token = \\\"<TOKEN>\\\"": fmt.Sprintf("token = \\\"%s\\\"", userToken),
80-
"#configuration.verify_ssl = False": "configuration.verify_ssl = False",
81-
"name=\\\"sft\\\"": fmt.Sprintf("name=\\\"sft-%s\\\"", namespace.Name),
82-
"\"HF_TOKEN\\\": \\\"\\\"": fmt.Sprintf("\"HF_TOKEN\\\": \\\"%s\\\"", hfToken),
83-
"claim_name=\\\"shared\\\"": fmt.Sprintf("claim_name=\\\"%s\\\"", notebookPVC.Name),
84-
"eval_strategy: epoch": "eval_strategy: 'no'",
85-
"logging_steps: 1": "logging_steps: 10",
86-
"\"client.get_job_logs(\\n\",": "\"client.wait_for_job_conditions(\\n\",",
87-
"\" follow=True,\\n\",": "\" wait_timeout=1800,\\n\",\n\t\" polling_interval=60,\\n\",",
88-
"os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]": "#os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]",
89-
"%load_ext tensorboard": "#%load_ext tensorboard",
90-
"%tensorboard --logdir /opt/app-root/src/shared": "#%tensorboard --logdir /opt/app-root/src/shared",
118+
"num_train_epochs: 10": "num_train_epochs: 1",
119+
"eval_strategy: epoch": "eval_strategy: 'no'",
120+
"logging_steps: 1": "logging_steps: 10",
121+
"output_dir: /mnt/shared/Meta-Llama-3.1-8B-Instruct": fmt.Sprintf("output_dir: /mnt/shared/%s", modelName),
122+
"api_server = \\\"<API_SERVER>\\\"": fmt.Sprintf("api_server = \\\"%s\\\"", GetOpenShiftApiUrl(test)),
123+
"token = \\\"<TOKEN>\\\"": fmt.Sprintf("token = \\\"%s\\\"", userToken),
124+
"#configuration.verify_ssl = False": "configuration.verify_ssl = False",
125+
"name=\\\"sft\\\"": fmt.Sprintf("name=\\\"sft-%s\\\"", namespace.Name),
126+
"train_func=main,": fmt.Sprintf("labels= {\\n\",\n\t\" \\\"kueue.x-k8s.io/queue-name\\\": \\\"%s\\\"\\n\",\n\t\" },\\n\",\n\t\" train_func=main,", localQueue.Name),
127+
" \\\"nvidia.com/gpu\\\"": fmt.Sprintf(" \\\"%s\\\"", gpu.ResourceLabel),
128+
"base_image=\\\"quay.io/modh/training:py311-cuda124-torch251\\\"": fmt.Sprintf("base_image=\\\"%s\\\"", image),
129+
"\"HF_TOKEN\\\": \\\"\\\"": fmt.Sprintf("\"HF_TOKEN\\\": \\\"%s\\\"", hfToken),
130+
"claim_name=\\\"shared\\\"": fmt.Sprintf("claim_name=\\\"%s\\\"", notebookPVC.Name),
131+
"\"client.get_job_logs(\\n\",": "\"client.wait_for_job_conditions(\\n\",",
132+
"\" follow=True,\\n\",": "\" wait_timeout=1800,\\n\",\n\t\" polling_interval=60,\\n\",",
133+
"os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]": "#os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]",
134+
"%load_ext tensorboard": "#%load_ext tensorboard",
135+
"%tensorboard --logdir /opt/app-root/src/shared": "#%tensorboard --logdir /opt/app-root/src/shared",
91136
"pretrained_path = \\\"/opt/app-root/src/shared/.cache/hub/models--Meta-Llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/\\\"": "pretrained_path = \\\"/opt/app-root/src/.cache/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/\\\"",
92137
"# Test the pre-trained model": "# Test the pre-trained model\\n\",\n\"from IPython.display import Markdown, display\\n\",\n\"import os",
93138
"display(Markdown(output1))": "display(Markdown(output1))\\n\",\n\"\\n\",\n\"# Save to file\\n\",\n\"output_path = \\\"/opt/app-root/src/pretrained_output.md\\\"\\n\",\n\"os.makedirs(os.path.dirname(output_path), exist_ok=True)\\n\",\n\"with open(output_path, \\\"w\\\") as f:\\n\",\n\t\" f.write(output1)",
@@ -128,7 +173,7 @@ func kftoSftLlm(t *testing.T, modelName string) {
128173
}
129174

130175
// Create Notebook CR
131-
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, "sft.ipynb", 1, notebookPVC)
176+
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, "sft.ipynb", 1, notebookPVC, ContainerSizeMedium, gpu.ResourceLabel)
132177

133178
// Gracefully cleanup Notebook
134179
defer func() {
@@ -177,7 +222,7 @@ func kftoSftLlm(t *testing.T, modelName string) {
177222
test.T().Logf("Notebook execution completed as indicated by marker file: '%s'", markerContent)
178223
return true
179224
} else if markerContent == "FAILURE" {
180-
errMessage := fmt.Sprintf("Notebook execution failed as indicated by marker file: '%s'", markerContent)
225+
errMessage := fmt.Sprintf("Notebook execution failed as indicated by marker file '%s': %s", markerPath, markerContent)
181226
test.T().Errorf("%s", errMessage)
182227
notebookExecutionFinalError = fmt.Errorf("%s", errMessage)
183228
// Return true to stop Eventually polling, because a *final* state has been reached

tests/odh/mnist_ray_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ func mnistRay(t *testing.T, numGpus int, gpuResourceName string, rayImage string
131131

132132
notebookCommand := getNotebookCommand(rayImage)
133133
// Create Notebook CR
134-
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC)
134+
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC, ContainerSizeSmall)
135135

136136
// Gracefully cleanup Notebook
137137
defer func() {

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
114114
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", AccessModes(corev1.ReadWriteOnce))
115115

116116
// Create Notebook CR
117-
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC)
117+
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC, ContainerSizeSmall)
118118

119119
// Gracefully cleanup Notebook
120120
defer func() {

tests/odh/ray_finetune_llm_deepspeed_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int, modelName string, modelC
113113
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", AccessModes(corev1.ReadWriteOnce))
114114

115115
// Create Notebook CR
116-
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC)
116+
CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC, ContainerSizeSmall)
117117

118118
// Gracefully cleanup Notebook
119119
defer func() {

0 commit comments

Comments
 (0)