Skip to content

Commit 6a06231

Browse files
sutaakaropenshift-merge-bot[bot]
authored andcommitted
Implement custom test tags for ODH KFTO tests
1 parent b3d17f0 commit 6a06231

9 files changed

+216
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
* `CODEFLARE_TEST_TIMEOUT_LONG` - Timeout duration for long tasks
2525
* `CODEFLARE_TEST_RAY_IMAGE` (Optional) - Ray image used for raycluster configuration
2626
* `MINIO_CLI_IMAGE` (Optional) - Minio CLI image used for uploading/downloading data from/into s3 bucket
27+
* `TEST_TIER` (Optional) - Specifies the test tier to run, skipping tests which don't belong to the specified test tier. Supported test tiers: Smoke, Sanity, Tier1, Tier2, Tier3, Pre-Upgrade, Post-Upgrade, KFTO-CUDA and KFTO-ROCm.
2728

2829
NOTE: `quay.io/modh/ray:2.35.0-py311-cu121` is the default image used for creating a RayCluster resource. If you have your own custom ray image which suits your purposes, specify it in `CODEFLARE_TEST_RAY_IMAGE` environment variable.
2930

tests/common/environment.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package common
1818

1919
import (
2020
"os"
21+
"slices"
2122

2223
. "github.com/project-codeflare/codeflare-common/support"
2324
)
@@ -31,8 +32,24 @@ const (
3132
notebookUserToken = "NOTEBOOK_USER_TOKEN"
3233
// Image of the Notebook
3334
notebookImage = "NOTEBOOK_IMAGE"
35+
// Test tier to be invoked
36+
testTierEnvVar = "TEST_TIER"
3437
)
3538

39+
// Values accepted in the TEST_TIER environment variable. All of them are
// collected in testTiers, which GetTestTier uses for validation.
const (
40+
tierSmoke = "Smoke"
41+
tierSanity = "Sanity"
42+
tier1 = "Tier1"
43+
tier2 = "Tier2"
44+
tier3 = "Tier3"
45+
preUpgrade = "Pre-Upgrade"
46+
postUpgrade = "Post-Upgrade"
47+
kftoCuda = "KFTO-CUDA"
48+
kftoRocm = "KFTO-ROCm"
49+
)
50+
51+
// testTiers lists every value GetTestTier accepts for the TEST_TIER
// environment variable; any other value fails the test.
var testTiers = []string{tierSmoke, tierSanity, tier1, tier2, tier3, preUpgrade, postUpgrade, kftoCuda, kftoRocm}
52+
3653
func GetOpenDataHubNamespace(t Test) string {
3754
ns, ok := os.LookupEnv(odhNamespaceEnvVar)
3855
if !ok {
@@ -64,3 +81,14 @@ func GetNotebookImage(t Test) string {
6481
}
6582
return notebook_image
6683
}
84+
85+
func GetTestTier(t Test) (string, bool) {
86+
tt, ok := os.LookupEnv(testTierEnvVar)
87+
if ok {
88+
if slices.Contains(testTiers, tt) {
89+
return tt, true
90+
}
91+
t.T().Fatalf("Environment variable %s is defined and contains invalid value: '%s'. Valid values are: %v", testTierEnvVar, tt, testTiers)
92+
}
93+
return "", false
94+
}

tests/common/test_tag.go

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
/*
2+
Copyright 2025.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package common
18+
19+
import (
20+
"fmt"
21+
"testing"
22+
23+
. "github.com/onsi/gomega"
24+
. "github.com/project-codeflare/codeflare-common/support"
25+
26+
corev1 "k8s.io/api/core/v1"
27+
"k8s.io/apimachinery/pkg/api/resource"
28+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
)
30+
31+
func Tags(t *testing.T, tags ...func(test Test) (runTest bool, skipReason string)) {
32+
test := With(t)
33+
for _, tag := range tags {
34+
runTest, skipReason := tag(test)
35+
if !runTest {
36+
test.T().Skip(skipReason)
37+
}
38+
}
39+
}
40+
41+
// Test tag list
42+
43+
var Smoke = func(test Test) (runTest bool, skipReason string) {
44+
return testTier(test, tierSmoke)
45+
}
46+
47+
var Sanity = func(test Test) (runTest bool, skipReason string) {
48+
return testTier(test, tierSanity)
49+
}
50+
51+
var Tier1 = func(test Test) (runTest bool, skipReason string) {
52+
return testTier(test, tier1)
53+
}
54+
55+
var Tier2 = func(test Test) (runTest bool, skipReason string) {
56+
return testTier(test, tier2)
57+
}
58+
59+
var Tier3 = func(test Test) (runTest bool, skipReason string) {
60+
return testTier(test, tier3)
61+
}
62+
63+
var PreUpgrade = func(test Test) (runTest bool, skipReason string) {
64+
return mandatoryTestTier(test, preUpgrade)
65+
}
66+
67+
var PostUpgrade = func(test Test) (runTest bool, skipReason string) {
68+
return mandatoryTestTier(test, postUpgrade)
69+
}
70+
71+
var KftoCuda = func(test Test) (runTest bool, skipReason string) {
72+
return testTier(test, kftoCuda)
73+
}
74+
75+
var KftoRocm = func(test Test) (runTest bool, skipReason string) {
76+
return testTier(test, kftoRocm)
77+
}
78+
79+
func Gpu(accelerator Accelerator) func(test Test) (runTest bool, skipReason string) {
80+
return func(test Test) (runTest bool, skipReason string) {
81+
return isGpuCountAvailableForNodes(test, 1, accelerator.ResourceLabel, 1)
82+
}
83+
}
84+
85+
func MultiGpu(accelerator Accelerator, numberOfGpus int) func(test Test) (runTest bool, skipReason string) {
86+
return func(test Test) (runTest bool, skipReason string) {
87+
return isGpuCountAvailableForNodes(test, 1, accelerator.ResourceLabel, numberOfGpus)
88+
}
89+
}
90+
91+
func MultiNode(numberOfNodes int) func(test Test) (runTest bool, skipReason string) {
92+
return func(test Test) (runTest bool, skipReason string) {
93+
nodes, err := test.Client().Core().CoreV1().Nodes().List(test.Ctx(), v1.ListOptions{LabelSelector: "node-role.kubernetes.io/worker"})
94+
test.Expect(err).NotTo(HaveOccurred())
95+
96+
if len(nodes.Items) < numberOfNodes {
97+
return false, fmt.Sprintf("Detected number of nodes is %d, which is lower than expected %d.", len(nodes.Items), numberOfNodes)
98+
}
99+
return true, ""
100+
}
101+
}
102+
103+
func MultiNodeGpu(numberOfNodes int, accelerator Accelerator) func(test Test) (runTest bool, skipReason string) {
104+
return func(test Test) (runTest bool, skipReason string) {
105+
return isGpuCountAvailableForNodes(test, numberOfNodes, accelerator.ResourceLabel, 1)
106+
}
107+
}
108+
109+
func MultiNodeMultiGpu(numberOfNodes int, accelerator Accelerator, numberOfGpus int) func(test Test) (runTest bool, skipReason string) {
110+
return func(test Test) (runTest bool, skipReason string) {
111+
return isGpuCountAvailableForNodes(test, numberOfNodes, accelerator.ResourceLabel, numberOfGpus)
112+
}
113+
}
114+
115+
// util functions
116+
117+
func testTier(test Test, expectedTestTier string) (runTest bool, skipReason string) {
118+
actualTestTier, found := GetTestTier(test)
119+
if !found || actualTestTier == expectedTestTier {
120+
return true, ""
121+
}
122+
return false, fmt.Sprintf("Test tier '%s' doesn't match expected tier '%s'", actualTestTier, expectedTestTier)
123+
}
124+
125+
func mandatoryTestTier(test Test, expectedTestTier string) (runTest bool, skipReason string) {
126+
actualTestTier, found := GetTestTier(test)
127+
if found && actualTestTier == expectedTestTier {
128+
return true, ""
129+
}
130+
return false, fmt.Sprintf("Test tier '%s' doesn't match expected tier '%s'", actualTestTier, expectedTestTier)
131+
}
132+
133+
func isGpuCountAvailableForNodes(test Test, expectedNodes int, gpuResourceName string, expectedGpus int) (runTest bool, skipReason string) {
134+
nodes, err := test.Client().Core().CoreV1().Nodes().List(test.Ctx(), v1.ListOptions{LabelSelector: "node-role.kubernetes.io/worker"})
135+
test.Expect(err).NotTo(HaveOccurred())
136+
137+
var gpuNodes []corev1.Node
138+
for _, node := range nodes.Items {
139+
if node.Status.Allocatable.Name(corev1.ResourceName(gpuResourceName), resource.DecimalSI).Value() != 0 {
140+
gpuNodes = append(gpuNodes, node)
141+
}
142+
}
143+
144+
if len(gpuNodes) < expectedNodes {
145+
return false, fmt.Sprintf("Detected number of nodes with resource '%s' is %d, which is lower than expected %d.", gpuResourceName, len(gpuNodes), expectedNodes)
146+
}
147+
148+
for _, gpuNode := range gpuNodes {
149+
gpuCount := int(gpuNode.Status.Allocatable.Name(corev1.ResourceName(gpuResourceName), resource.DecimalSI).Value())
150+
151+
if gpuCount < expectedGpus {
152+
return false, fmt.Sprintf("Detected number of GPUs for nodes with resource '%s' is %d, which is lower than expected %d.", gpuResourceName, gpuCount, expectedGpus)
153+
}
154+
}
155+
return true, ""
156+
}

tests/kfto/kfto_kueue_mnist_upgrade_training_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ import (
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3030
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
3131
kueueacv1beta1 "sigs.k8s.io/kueue/client-go/applyconfiguration/kueue/v1beta1"
32+
33+
. "github.com/opendatahub-io/distributed-workloads/tests/common"
3234
)
3335

3436
var (
@@ -40,6 +42,7 @@ var (
4042
)
4143

4244
func TestSetupPytorchjob(t *testing.T) {
45+
Tags(t, PreUpgrade)
4346
test := With(t)
4447

4548
createOrGetUpgradeTestNamespace(test, namespaceName)
@@ -102,6 +105,7 @@ func TestSetupPytorchjob(t *testing.T) {
102105
}
103106

104107
func TestRunPytorchjob(t *testing.T) {
108+
Tags(t, PostUpgrade)
105109
test := With(t)
106110
namespace := GetNamespaceWithName(test, namespaceName)
107111

tests/kfto/kfto_mnist_sdk_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
)
3131

3232
func TestMnistSDK(t *testing.T) {
33+
Tags(t, Tier1)
3334
test := With(t)
3435
// Create a namespace
3536
namespace := test.NewTestNamespace()

tests/kfto/kfto_mnist_training_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,28 +28,36 @@ import (
2828
corev1 "k8s.io/api/core/v1"
2929
"k8s.io/apimachinery/pkg/api/resource"
3030
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31+
32+
. "github.com/opendatahub-io/distributed-workloads/tests/common"
3133
)
3234

3335
// TestPyTorchJobMnistMultiNodeSingleCpu is tagged Sanity and MultiNode(3) and
// delegates to runKFTOPyTorchMnistJob with the CPU accelerator, the image from
// GetCudaTrainingImage() and resources/requirements.txt.
// NOTE(review): the trailing 2, 1 are presumably worker and per-node process
// counts — confirm against runKFTOPyTorchMnistJob.
func TestPyTorchJobMnistMultiNodeSingleCpu(t *testing.T) {
36+
Tags(t, Sanity, MultiNode(3))
3437
runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 1)
3538
}
3639
// TestPyTorchJobMnistMultiNodeMultiCpu is tagged Tier1 and MultiNode(3) and
// delegates to runKFTOPyTorchMnistJob with the CPU accelerator, the image from
// GetCudaTrainingImage() and resources/requirements.txt.
// NOTE(review): the trailing 2, 2 are presumably worker and per-node process
// counts — confirm against runKFTOPyTorchMnistJob.
func TestPyTorchJobMnistMultiNodeMultiCpu(t *testing.T) {
40+
Tags(t, Tier1, MultiNode(3))
3741
runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 2)
3842
}
3943

4044
// TestPyTorchJobMnistMultiNodeSingleGpuWithCuda is tagged KftoCuda and
// delegates to runKFTOPyTorchMnistJob with the NVIDIA accelerator, the image
// from GetCudaTrainingImage() and resources/requirements.txt.
// NOTE(review): the trailing 1, 1 are presumably worker and per-node GPU
// counts — confirm against runKFTOPyTorchMnistJob.
func TestPyTorchJobMnistMultiNodeSingleGpuWithCuda(t *testing.T) {
45+
Tags(t, KftoCuda)
4146
runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 1)
4247
}
4348

4449
// TestPyTorchJobMnistMultiNodeMultiGpuWithCuda is tagged KftoCuda and
// delegates to runKFTOPyTorchMnistJob with the NVIDIA accelerator, the image
// from GetCudaTrainingImage() and resources/requirements.txt.
// NOTE(review): the trailing 1, 2 are presumably worker and per-node GPU
// counts — confirm against runKFTOPyTorchMnistJob.
func TestPyTorchJobMnistMultiNodeMultiGpuWithCuda(t *testing.T) {
50+
Tags(t, KftoCuda)
4551
runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 2)
4652
}
4753

4854
// TestPyTorchJobMnistMultiNodeSingleGpuWithROCm is tagged KftoRocm and
// delegates to runKFTOPyTorchMnistJob with the AMD accelerator, the image from
// GetROCmTrainingImage() and resources/requirements-rocm.txt.
// NOTE(review): the trailing 1, 1 are presumably worker and per-node GPU
// counts — confirm against runKFTOPyTorchMnistJob.
func TestPyTorchJobMnistMultiNodeSingleGpuWithROCm(t *testing.T) {
55+
Tags(t, KftoRocm)
4956
runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 1)
5057
}
5158

5259
// TestPyTorchJobMnistMultiNodeMultiGpuWithROCm is tagged KftoRocm and
// delegates to runKFTOPyTorchMnistJob with the AMD accelerator, the image from
// GetROCmTrainingImage() and resources/requirements-rocm.txt.
// NOTE(review): the trailing 1, 2 are presumably worker and per-node GPU
// counts — confirm against runKFTOPyTorchMnistJob.
func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm(t *testing.T) {
60+
Tags(t, KftoRocm)
5361
runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 2)
5462
}
5563

tests/kfto/kfto_pytorchjob_failed_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,17 @@ import (
1010
corev1 "k8s.io/api/core/v1"
1111
"k8s.io/apimachinery/pkg/api/resource"
1212
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
14+
. "github.com/opendatahub-io/distributed-workloads/tests/common"
1315
)
1416

1517
// TestPyTorchJobFailureWithCuda is tagged Tier1 and delegates to
// runFailedPyTorchJobTest with the image from GetCudaTrainingImage().
func TestPyTorchJobFailureWithCuda(t *testing.T) {
18+
Tags(t, Tier1)
1619
runFailedPyTorchJobTest(t, GetCudaTrainingImage())
1720
}
1821

1922
// TestPyTorchJobFailureWithROCm is tagged Tier1 and delegates to
// runFailedPyTorchJobTest with the image from GetROCmTrainingImage().
func TestPyTorchJobFailureWithROCm(t *testing.T) {
23+
Tags(t, Tier1)
2024
runFailedPyTorchJobTest(t, GetROCmTrainingImage())
2125
}
2226

tests/kfto/kfto_training_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,37 +28,47 @@ import (
2828
corev1 "k8s.io/api/core/v1"
2929
"k8s.io/apimachinery/pkg/api/resource"
3030
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31+
32+
. "github.com/opendatahub-io/distributed-workloads/tests/common"
3133
)
3234

3335
// TestPyTorchJobSingleNodeSingleGpuWithCuda is tagged Tier1 and Gpu(NVIDIA)
// and delegates to runKFTOPyTorchJob with the image from
// GetCudaTrainingImage() and the NVIDIA accelerator.
// NOTE(review): the trailing 1, 0 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobSingleNodeSingleGpuWithCuda(t *testing.T) {
36+
Tags(t, Tier1, Gpu(NVIDIA))
3437
runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 1, 0)
3538
}
3639

3740
// TestPyTorchJobSingleNodeMultiGpuWithCuda is tagged KftoCuda and delegates to
// runKFTOPyTorchJob with the image from GetCudaTrainingImage() and the NVIDIA
// accelerator.
// NOTE(review): the trailing 2, 0 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobSingleNodeMultiGpuWithCuda(t *testing.T) {
41+
Tags(t, KftoCuda)
3842
runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 2, 0)
3943
}
4044

4145
// TestPyTorchJobMultiNodeSingleGpuWithCuda is tagged KftoCuda and delegates to
// runKFTOPyTorchJob with the image from GetCudaTrainingImage() and the NVIDIA
// accelerator.
// NOTE(review): the trailing 1, 1 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobMultiNodeSingleGpuWithCuda(t *testing.T) {
46+
Tags(t, KftoCuda)
4247
runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 1, 1)
4348
}
4449

4550
// TestPyTorchJobMultiNodeMultiGpuWithCuda is tagged KftoCuda and delegates to
// runKFTOPyTorchJob with the image from GetCudaTrainingImage() and the NVIDIA
// accelerator.
// NOTE(review): the trailing 2, 1 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobMultiNodeMultiGpuWithCuda(t *testing.T) {
51+
Tags(t, KftoCuda)
4652
runKFTOPyTorchJob(t, GetCudaTrainingImage(), NVIDIA, 2, 1)
4753
}
4854

4955
// TestPyTorchJobSingleNodeSingleGpuWithROCm is tagged Tier1 and Gpu(AMD) and
// delegates to runKFTOPyTorchJob with the image from GetROCmTrainingImage()
// and the AMD accelerator.
// NOTE(review): the trailing 1, 0 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobSingleNodeSingleGpuWithROCm(t *testing.T) {
56+
Tags(t, Tier1, Gpu(AMD))
5057
runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 1, 0)
5158
}
5259

5360
// TestPyTorchJobSingleNodeMultiGpuWithROCm is tagged KftoRocm and delegates to
// runKFTOPyTorchJob with the image from GetROCmTrainingImage() and the AMD
// accelerator.
// NOTE(review): the trailing 2, 0 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobSingleNodeMultiGpuWithROCm(t *testing.T) {
61+
Tags(t, KftoRocm)
5462
runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 2, 0)
5563
}
5664

5765
// TestPyTorchJobMultiNodeSingleGpuWithROCm is tagged KftoRocm and delegates to
// runKFTOPyTorchJob with the image from GetROCmTrainingImage() and the AMD
// accelerator.
// NOTE(review): the trailing 1, 1 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobMultiNodeSingleGpuWithROCm(t *testing.T) {
66+
Tags(t, KftoRocm)
5867
runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 1, 1)
5968
}
6069

6170
// TestPyTorchJobMultiNodeMultiGpuWithROCm is tagged KftoRocm and delegates to
// runKFTOPyTorchJob with the image from GetROCmTrainingImage() and the AMD
// accelerator.
// NOTE(review): the trailing 2, 1 are presumably GPU count and worker count —
// confirm against runKFTOPyTorchJob.
func TestPyTorchJobMultiNodeMultiGpuWithROCm(t *testing.T) {
71+
Tags(t, KftoRocm)
6272
runKFTOPyTorchJob(t, GetROCmTrainingImage(), AMD, 2, 1)
6373
}
6474

tests/kfto/kfto_upgrade_sleep_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626
corev1 "k8s.io/api/core/v1"
2727
"k8s.io/apimachinery/pkg/api/errors"
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
30+
. "github.com/opendatahub-io/distributed-workloads/tests/common"
2931
)
3032

3133
var (
@@ -34,6 +36,7 @@ var (
3436
)
3537

3638
func TestSetupSleepPytorchjob(t *testing.T) {
39+
Tags(t, PreUpgrade)
3740
test := With(t)
3841

3942
// Create a namespace
@@ -48,6 +51,7 @@ func TestSetupSleepPytorchjob(t *testing.T) {
4851
}
4952

5053
func TestVerifySleepPytorchjob(t *testing.T) {
54+
Tags(t, PostUpgrade)
5155
test := With(t)
5256
namespace := GetNamespaceWithName(test, sleepNamespaceName)
5357

0 commit comments

Comments
 (0)