Merge remote-tracking branch 'origin/main'

sutaakar · sutaakar · commit 3ca59ee5d0a4 · 2025-06-25T13:01:27.000+02:00
diff --git a/images/tests/.env-odh b/images/tests/.env-odh
@@ -0,0 +1,2 @@
+FMS_HF_TUNING_IMAGE=quay.io/modh/fms-hf-tuning:release
+NOTEBOOK_IMAGE=quay.io/modh/odh-workbench-jupyter-datascience-cpu-py311-ubi9:rhoai-2.22
diff --git a/images/tests/Dockerfile b/images/tests/Dockerfile
@@ -24,13 +24,9 @@ WORKDIR /distributed-workloads/tests
 # Copy the source from the current directory to the working directory inside the container
 COPY tests/ .
 
+# Copy the .env-odh file and the run-test.sh script to the working directory inside the container
+COPY images/tests/.env-odh .env-odh
+COPY images/tests/run-test.sh .
+
 # Command to run the tests
-ENTRYPOINT [ "gotestsum"]
-
-# Configure images using environment variables
-ENV FMS_HF_TUNING_IMAGE=<replace_me>
-ENV TEST_TRAINING_CUDA_PYTORCH_241_IMAGE=<replace_me>
-ENV TEST_TRAINING_ROCM_PYTORCH_241_IMAGE=<replace_me>
-ENV TEST_TRAINING_CUDA_PYTORCH_251_IMAGE=<replace_me>
-ENV TEST_TRAINING_ROCM_PYTORCH_251_IMAGE=<replace_me>
-ENV NOTEBOOK_IMAGE=<replace_me>
+ENTRYPOINT ["bash", "run-test.sh"]
diff --git a/images/tests/run-test.sh b/images/tests/run-test.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Entrypoint for the test image: loads default image settings from .env-odh
+# and then runs gotestsum with the arguments passed to the container.
+
+# Remember variables supplied by the caller (e.g. `podman run -e NOTEBOOK_IMAGE=...`).
+caller_env="$(export -p)"
+
+# Export every variable assigned in .env-odh so the Go tests can read it.
+set -o allexport
+source .env-odh
+set +o allexport
+
+# Re-apply the caller's values so explicit overrides win over the .env-odh defaults.
+eval "$caller_env"
+
+# exec replaces the shell so gotestsum receives container signals (e.g. SIGTERM) directly.
+exec gotestsum "$@"
diff --git a/tests/common/notebook.go b/tests/common/notebook.go
@@ -47,6 +47,35 @@ func readFile(t Test, fileName string) []byte {
 	return file
 }
 
+var ( // Predefined notebook container resource presets, chosen in CreateNotebook by ContainerSize.
+	SmallContainerResources = ContainerResources{ // used for ContainerSizeSmall and as the fallback for unsupported sizes
+		Limits:   ResourceConfig{CPU: "2", Memory: "3Gi"},
+		Requests: ResourceConfig{CPU: "1", Memory: "3Gi"},
+	}
+	MediumContainerResources = ContainerResources{ // used for ContainerSizeMedium
+		Limits:   ResourceConfig{CPU: "6", Memory: "24Gi"},
+		Requests: ResourceConfig{CPU: "3", Memory: "24Gi"},
+	}
+)
+
+type ResourceConfig struct { // one side (limits or requests) of the notebook container's resources
+	CPU              string // rendered as the cpu quantity in the notebook template, e.g. "2"
+	Memory           string // rendered as the memory quantity in the notebook template, e.g. "3Gi"
+	GPUResourceLabel string // resource name rendered with quantity 1, e.g. "nvidia.com/gpu" or "amd.com/gpu"; "" omits the GPU entry
+}
+
+type ContainerResources struct { // limits/requests pair passed to the notebook template as NotebookResources
+	Limits   ResourceConfig // values rendered under resources.limits
+	Requests ResourceConfig // values rendered under resources.requests
+}
+
+type ContainerSize string // notebook size preset name; also rendered into the last-size-selection annotation
+
+const (
+	ContainerSizeSmall  ContainerSize = "small"  // selects SmallContainerResources in CreateNotebook
+	ContainerSizeMedium ContainerSize = "medium" // selects MediumContainerResources in CreateNotebook
+)
+
 var notebookResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1", Resource: "notebooks"}
 
 type NotebookProps struct {
@@ -68,9 +97,11 @@ type NotebookProps struct {
 	S3SecretAccessKey         string
 	S3Endpoint                string
 	S3DefaultRegion           string
+	NotebookResources         ContainerResources
+	SizeSelection             ContainerSize
 }
 
-func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken string, command []string, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int, notebookPVC *corev1.PersistentVolumeClaim) {
+func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken string, command []string, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int, notebookPVC *corev1.PersistentVolumeClaim, containerSize ContainerSize, acceleratorResourceLabel ...string) {
 	s3BucketName, s3BucketNameExists := GetStorageBucketName()
 	s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
 	s3SecretAccessKey, _ := GetStorageBucketSecretKey()
@@ -86,6 +117,36 @@ func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken st
 		s3DefaultRegion = "''"
 	}
 
+	// Resolve the optional accelerator label: exactly one value is honoured, otherwise no GPU is requested.
+	var selectedContainerResources ContainerResources
+	var gpuResourceLabel string
+	if len(acceleratorResourceLabel) == 1 {
+		gpuResourceLabel = acceleratorResourceLabel[0]
+	}
+
+	switch containerSize {
+	case ContainerSizeSmall:
+		// Small notebooks never request a GPU, whatever label was passed in.
+		selectedContainerResources = SmallContainerResources
+		selectedContainerResources.Limits.GPUResourceLabel = ""
+		selectedContainerResources.Requests.GPUResourceLabel = ""
+	case ContainerSizeMedium:
+		selectedContainerResources = MediumContainerResources
+		if gpuResourceLabel != "" && gpuResourceLabel != NVIDIA.ResourceLabel && gpuResourceLabel != AMD.ResourceLabel {
+			test.T().Errorf("Unsupported GPU resource label for medium size: %s. Must be '%s', '%s', or an empty string.", gpuResourceLabel, NVIDIA.ResourceLabel, AMD.ResourceLabel)
+			gpuResourceLabel = "" // Fallback to no GPU if label is invalid
+		}
+		// Apply the validated label to both limits and requests.
+		selectedContainerResources.Limits.GPUResourceLabel = gpuResourceLabel
+		selectedContainerResources.Requests.GPUResourceLabel = gpuResourceLabel
+	default:
+		test.T().Errorf("Unsupported container size: %s. Must be '%s' or '%s'. Hence using '%s' container size.",
+			containerSize, ContainerSizeSmall, ContainerSizeMedium, ContainerSizeSmall)
+		// Fall back to Small and reset containerSize so SizeSelection matches the resources actually used.
+		selectedContainerResources = SmallContainerResources
+		containerSize = ContainerSizeSmall
+	}
+
 	// Read the Notebook CR from resources and perform replacements for custom values using go template
 	notebookProps := NotebookProps{
 		IngressDomain:             GetOpenShiftIngressDomain(test),
@@ -106,6 +167,8 @@ func CreateNotebook(test Test, namespace *corev1.Namespace, notebookUserToken st
 		S3DefaultRegion:           s3DefaultRegion,
 		PipIndexUrl:               GetPipIndexURL(),
 		PipTrustedHost:            GetPipTrustedHost(),
+		NotebookResources:         selectedContainerResources,
+		SizeSelection:             containerSize,
 	}
 	notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml")
 	test.Expect(err).NotTo(gomega.HaveOccurred())
diff --git a/tests/common/resources/custom-nb-small.yaml b/tests/common/resources/custom-nb-small.yaml
@@ -11,7 +11,7 @@ metadata:
   annotations:
     notebooks.opendatahub.io/inject-oauth: "true"
     notebooks.opendatahub.io/last-image-selection: codeflare-notebook:latest
-    notebooks.opendatahub.io/last-size-selection: Small
+    notebooks.opendatahub.io/last-size-selection: {{.SizeSelection}}
     notebooks.opendatahub.io/oauth-logout-url: https://odh-dashboard-{{.OpenDataHubNamespace}}.{{.IngressDomain}}/notebookController/kube-3aadmin/home
     opendatahub.io/link: https://jupyter-nb-kube-3aadmin-{{.Namespace}}.{{.IngressDomain}}/notebook/{{.Namespace}}/jupyter-nb-kube-3aadmin
     opendatahub.io/username: kube:admin
@@ -75,11 +75,17 @@ spec:
           protocol: TCP
         resources:
           limits:
-            cpu: "2"
-            memory: 3Gi
+            cpu: "{{ .NotebookResources.Limits.CPU }}"
+            memory: "{{ .NotebookResources.Limits.Memory }}"
+            {{ if .NotebookResources.Limits.GPUResourceLabel }} # Check if GPUResourceLabel is NOT empty
+            {{ .NotebookResources.Limits.GPUResourceLabel }}: 1
+            {{ end }}
           requests:
-            cpu: "1"
-            memory: 3Gi
+            cpu: "{{ .NotebookResources.Requests.CPU }}"
+            memory: "{{ .NotebookResources.Requests.Memory }}"
+            {{ if .NotebookResources.Requests.GPUResourceLabel }} # Check if GPUResourceLabel is NOT empty
+            {{ .NotebookResources.Requests.GPUResourceLabel }}: 1
+            {{ end }}
         volumeMounts:
         - mountPath: /opt/app-root/src
           name: jupyterhub-nb-kube-3aadmin-pvc
diff --git a/tests/kfto/kfto_mnist_sdk_test.go b/tests/kfto/kfto_mnist_sdk_test.go
@@ -114,7 +114,7 @@ func runMnistSDK(t *testing.T, trainingImage string) {
 	notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", AccessModes(corev1.ReadWriteOnce))
 
 	// Create Notebook CR
-	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, 0, notebookPVC)
+	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, 0, notebookPVC, ContainerSizeSmall)
 
 	// Gracefully cleanup Notebook
 	defer func() {
diff --git a/tests/kfto/kfto_sft_llm_test.go b/tests/kfto/kfto_sft_llm_test.go
@@ -29,19 +29,26 @@ import (
 	. "github.com/onsi/gomega"
 
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	. "github.com/opendatahub-io/distributed-workloads/tests/common"
 	. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
 	"github.com/opendatahub-io/distributed-workloads/tests/odh"
 )
 
-func TestKftoSftLlmLlama3_1_8BInstruct(t *testing.T) {
+func TestKftoSftLlmLlama3_1_8BInstructWithCudaPyTorch251(t *testing.T) {
 	Tags(t, KftoCuda)
-	kftoSftLlm(t, "meta-llama/Llama-3.1-8B-Instruct")
+	kftoSftLlm(t, GetTrainingCudaPyTorch251Image(), NVIDIA, "meta-llama/Llama-3.1-8B-Instruct")
 }
 
-func kftoSftLlm(t *testing.T, modelName string) {
+func TestKftoSftLlmLlama3_1_8BInstructWithROCmPyTorch251(t *testing.T) {
+	Tags(t, KftoRocm)
+	kftoSftLlm(t, GetTrainingROCmPyTorch251Image(), AMD, "meta-llama/Llama-3.1-8B-Instruct")
+}
+
+func kftoSftLlm(t *testing.T, image string, gpu Accelerator, modelName string) {
 	test := With(t)
 
 	// Create a namespace
@@ -66,28 +73,66 @@ func kftoSftLlm(t *testing.T, modelName string) {
 	// Create PVC for Notebook
 	notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "500Gi", AccessModes(corev1.ReadWriteMany), StorageClassName(storageClass.Name))
 
+	// Create Kueue resources
+	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
+	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
+	cqSpec := v1beta1.ClusterQueueSpec{
+		NamespaceSelector: &metav1.LabelSelector{},
+		ResourceGroups: []v1beta1.ResourceGroup{
+			{
+				CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName(gpu.ResourceLabel)},
+				Flavors: []v1beta1.FlavorQuotas{
+					{
+						Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
+						Resources: []v1beta1.ResourceQuota{
+							{
+								Name:         corev1.ResourceCPU,
+								NominalQuota: resource.MustParse("32"),
+							},
+							{
+								Name:         corev1.ResourceMemory,
+								NominalQuota: resource.MustParse("512Gi"),
+							},
+							{
+								Name:         corev1.ResourceName(gpu.ResourceLabel),
+								NominalQuota: resource.MustParse("8"),
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
+	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
+	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
+
 	// Read and update the notebook content
 	notebookContent := odh.ReadFileExt(test, workingDirectory+"/../../examples/kfto-sft-llm/sft.ipynb")
 	updatedNotebookContent := string(notebookContent)
 
 	// Update notebook parameters for testing
 	requiredChangesInNotebook := map[string]string{
 		"model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct": fmt.Sprintf("model_name_or_path: %s", modelName),
-		"num_train_epochs: 10":                               "num_train_epochs: 1",
-		"output_dir: /mnt/shared/Meta-Llama-3.1-8B-Instruct": fmt.Sprintf("output_dir: /mnt/shared/%s", modelName),
-		"api_server = \\\"<API_SERVER>\\\"":                  fmt.Sprintf("api_server = \\\"%s\\\"", GetOpenShiftApiUrl(test)),
-		"token = \\\"<TOKEN>\\\"":                            fmt.Sprintf("token = \\\"%s\\\"", userToken),
-		"#configuration.verify_ssl = False":                  "configuration.verify_ssl = False",
-		"name=\\\"sft\\\"":                                   fmt.Sprintf("name=\\\"sft-%s\\\"", namespace.Name),
-		"\"HF_TOKEN\\\": \\\"\\\"":                           fmt.Sprintf("\"HF_TOKEN\\\": \\\"%s\\\"", hfToken),
-		"claim_name=\\\"shared\\\"":                          fmt.Sprintf("claim_name=\\\"%s\\\"", notebookPVC.Name),
-		"eval_strategy: epoch":                               "eval_strategy: 'no'",
-		"logging_steps: 1":                                   "logging_steps: 10",
-		"\"client.get_job_logs(\\n\",":                       "\"client.wait_for_job_conditions(\\n\",",
-		"\"    follow=True,\\n\",":                           "\"    wait_timeout=1800,\\n\",\n\t\"    polling_interval=60,\\n\",",
-		"os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]":          "#os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]",
-		"%load_ext tensorboard":                              "#%load_ext tensorboard",
-		"%tensorboard --logdir /opt/app-root/src/shared":     "#%tensorboard --logdir /opt/app-root/src/shared",
+		"num_train_epochs: 10": "num_train_epochs: 1",
+		"eval_strategy: epoch": "eval_strategy: 'no'",
+		"logging_steps: 1":     "logging_steps: 10",
+		"output_dir: /mnt/shared/Meta-Llama-3.1-8B-Instruct":              fmt.Sprintf("output_dir: /mnt/shared/%s", modelName),
+		"api_server = \\\"<API_SERVER>\\\"":                               fmt.Sprintf("api_server = \\\"%s\\\"", GetOpenShiftApiUrl(test)),
+		"token = \\\"<TOKEN>\\\"":                                         fmt.Sprintf("token = \\\"%s\\\"", userToken),
+		"#configuration.verify_ssl = False":                               "configuration.verify_ssl = False",
+		"name=\\\"sft\\\"":                                                fmt.Sprintf("name=\\\"sft-%s\\\"", namespace.Name),
+		"train_func=main,":                                                fmt.Sprintf("labels= {\\n\",\n\t\"            \\\"kueue.x-k8s.io/queue-name\\\": \\\"%s\\\"\\n\",\n\t\"    },\\n\",\n\t\"    train_func=main,", localQueue.Name),
+		"        \\\"nvidia.com/gpu\\\"":                                  fmt.Sprintf("        \\\"%s\\\"", gpu.ResourceLabel),
+		"base_image=\\\"quay.io/modh/training:py311-cuda124-torch251\\\"": fmt.Sprintf("base_image=\\\"%s\\\"", image),
+		"\"HF_TOKEN\\\": \\\"\\\"":                                        fmt.Sprintf("\"HF_TOKEN\\\": \\\"%s\\\"", hfToken),
+		"claim_name=\\\"shared\\\"":                                       fmt.Sprintf("claim_name=\\\"%s\\\"", notebookPVC.Name),
+		"\"client.get_job_logs(\\n\",":                                    "\"client.wait_for_job_conditions(\\n\",",
+		"\"    follow=True,\\n\",":                                        "\"    wait_timeout=1800,\\n\",\n\t\"    polling_interval=60,\\n\",",
+		"os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]":                       "#os.environ[\\\"TENSORBOARD_PROXY_URL\\\"]",
+		"%load_ext tensorboard":                                           "#%load_ext tensorboard",
+		"%tensorboard --logdir /opt/app-root/src/shared":                  "#%tensorboard --logdir /opt/app-root/src/shared",
 		"pretrained_path = \\\"/opt/app-root/src/shared/.cache/hub/models--Meta-Llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/\\\"": "pretrained_path = \\\"/opt/app-root/src/.cache/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/\\\"",
 		"# Test the pre-trained model": "# Test the pre-trained model\\n\",\n\"from IPython.display import Markdown, display\\n\",\n\"import os",
 		"display(Markdown(output1))":   "display(Markdown(output1))\\n\",\n\"\\n\",\n\"# Save to file\\n\",\n\"output_path = \\\"/opt/app-root/src/pretrained_output.md\\\"\\n\",\n\"os.makedirs(os.path.dirname(output_path), exist_ok=True)\\n\",\n\"with open(output_path, \\\"w\\\") as f:\\n\",\n\t\"    f.write(output1)",
@@ -128,7 +173,7 @@ func kftoSftLlm(t *testing.T, modelName string) {
 	}
 
 	// Create Notebook CR
-	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, "sft.ipynb", 1, notebookPVC)
+	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, "sft.ipynb", 1, notebookPVC, ContainerSizeMedium, gpu.ResourceLabel)
 
 	// Gracefully cleanup Notebook
 	defer func() {
@@ -177,7 +222,7 @@ func kftoSftLlm(t *testing.T, modelName string) {
 			test.T().Logf("Notebook execution completed as indicated by marker file: '%s'", markerContent)
 			return true
 		} else if markerContent == "FAILURE" {
-			errMessage := fmt.Sprintf("Notebook execution failed as indicated by marker file: '%s'", markerContent)
+			errMessage := fmt.Sprintf("Notebook execution failed as indicated by marker file '%s': %s", markerPath, markerContent)
 			test.T().Errorf("%s", errMessage)
 			notebookExecutionFinalError = fmt.Errorf("%s", errMessage)
 			// Return true to stop Eventually polling, because a *final* state has been reached
diff --git a/tests/odh/mnist_ray_test.go b/tests/odh/mnist_ray_test.go
@@ -131,7 +131,7 @@ func mnistRay(t *testing.T, numGpus int, gpuResourceName string, rayImage string
 
 	notebookCommand := getNotebookCommand(rayImage)
 	// Create Notebook CR
-	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC)
+	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC, ContainerSizeSmall)
 
 	// Gracefully cleanup Notebook
 	defer func() {
diff --git a/tests/odh/mnist_raytune_hpo_test.go b/tests/odh/mnist_raytune_hpo_test.go
@@ -114,7 +114,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
 	notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", AccessModes(corev1.ReadWriteOnce))
 
 	// Create Notebook CR
-	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC)
+	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC, ContainerSizeSmall)
 
 	// Gracefully cleanup Notebook
 	defer func() {
diff --git a/tests/odh/ray_finetune_llm_deepspeed_test.go b/tests/odh/ray_finetune_llm_deepspeed_test.go
@@ -113,7 +113,7 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int, modelName string, modelC
 	notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", AccessModes(corev1.ReadWriteOnce))
 
 	// Create Notebook CR
-	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC)
+	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC, ContainerSizeSmall)
 
 	// Gracefully cleanup Notebook
 	defer func() {
diff --git a/tests/odh/raytune_oai_mr_grpc_test.go b/tests/odh/raytune_oai_mr_grpc_test.go
@@ -123,7 +123,7 @@ func raytuneHpo(t *testing.T, numGpus int) {
 	notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", AccessModes(corev1.ReadWriteOnce))
 
 	// Create Notebook CR
-	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC)
+	CreateNotebook(test, namespace, userToken, notebookCommand, config.Name, jupyterNotebookConfigMapFileName, numGpus, notebookPVC, ContainerSizeSmall)
 
 	// Gracefully cleanup Notebook
 	defer func() {

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+FMS_HF_TUNING_IMAGE=quay.io/modh/fms-hf-tuning:release`
	`2`	`+NOTEBOOK_IMAGE=quay.io/modh/odh-workbench-jupyter-datascience-cpu-py311-ubi9:rhoai-2.22`