Skip to content

Commit 083084b

Browse files
Add kueue integration tests for Trainer v2
1 parent 495a1ee commit 083084b

File tree

10 files changed

+859
-145
lines changed

10 files changed

+859
-145
lines changed

go.mod

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,28 @@ module github.com/opendatahub-io/distributed-workloads
22

33
go 1.24.2
44

5+
// Remove replace when Training operator v1 tests are removed
6+
replace (
7+
github.com/kubeflow/training-operator => github.com/kubeflow/training-operator v1.7.0
8+
sigs.k8s.io/kueue => sigs.k8s.io/kueue v0.6.2
9+
)
10+
511
require (
612
github.com/kubeflow/trainer/v2 v2.0.0
713
github.com/kubeflow/training-operator v1.7.0
814
github.com/matoous/go-nanoid/v2 v2.1.0
9-
github.com/onsi/gomega v1.37.0
10-
github.com/openshift/api v0.0.0-20251015095338-264e80a2b6e7
15+
github.com/onsi/gomega v1.38.2
16+
github.com/openshift/api v0.0.0-20251124165233-999c45c0835a
1117
github.com/openshift/client-go v0.0.0-20251015124057-db0dee36e235
12-
github.com/prometheus/client_golang v1.22.0
13-
github.com/prometheus/common v0.62.0
18+
github.com/openshift/kueue-operator v0.0.0-20251202204851-958c48004dad
19+
github.com/prometheus/client_golang v1.23.0
20+
github.com/prometheus/common v0.65.0
1421
github.com/ray-project/kuberay/ray-operator v1.3.0
1522
k8s.io/api v0.34.1
1623
k8s.io/apimachinery v0.34.1
1724
k8s.io/client-go v0.34.1
18-
sigs.k8s.io/kueue v0.6.2
25+
sigs.k8s.io/jobset v0.9.1
26+
sigs.k8s.io/kueue v0.10.2
1927
)
2028

2129
require (
@@ -25,9 +33,9 @@ require (
2533
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
2634
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
2735
github.com/go-logr/logr v1.4.3 // indirect
28-
github.com/go-openapi/jsonpointer v0.21.0 // indirect
36+
github.com/go-openapi/jsonpointer v0.21.1 // indirect
2937
github.com/go-openapi/jsonreference v0.21.0 // indirect
30-
github.com/go-openapi/swag v0.23.0 // indirect
38+
github.com/go-openapi/swag v0.23.1 // indirect
3139
github.com/gogo/protobuf v1.3.2 // indirect
3240
github.com/google/gnostic-models v0.7.0 // indirect
3341
github.com/google/go-cmp v0.7.0 // indirect
@@ -41,23 +49,20 @@ require (
4149
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
4250
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
4351
github.com/pkg/errors v0.9.1 // indirect
44-
github.com/prometheus/client_model v0.6.1 // indirect
45-
github.com/prometheus/procfs v0.15.1 // indirect
52+
github.com/prometheus/client_model v0.6.2 // indirect
53+
github.com/prometheus/procfs v0.16.1 // indirect
4654
github.com/sirupsen/logrus v1.9.3 // indirect
4755
github.com/spf13/pflag v1.0.6 // indirect
4856
github.com/x448/float16 v0.8.4 // indirect
49-
go.etcd.io/etcd/client/pkg/v3 v3.5.21 // indirect
50-
go.etcd.io/etcd/client/v3 v3.5.21 // indirect
51-
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0 // indirect
5257
go.yaml.in/yaml/v2 v2.4.2 // indirect
5358
go.yaml.in/yaml/v3 v3.0.4 // indirect
54-
golang.org/x/net v0.38.0 // indirect
55-
golang.org/x/oauth2 v0.27.0 // indirect
56-
golang.org/x/sys v0.32.0 // indirect
57-
golang.org/x/term v0.30.0 // indirect
58-
golang.org/x/text v0.23.0 // indirect
59-
golang.org/x/time v0.10.0 // indirect
60-
google.golang.org/protobuf v1.36.5 // indirect
59+
golang.org/x/net v0.47.0 // indirect
60+
golang.org/x/oauth2 v0.30.0 // indirect
61+
golang.org/x/sys v0.38.0 // indirect
62+
golang.org/x/term v0.37.0 // indirect
63+
golang.org/x/text v0.31.0 // indirect
64+
golang.org/x/time v0.11.0 // indirect
65+
google.golang.org/protobuf v1.36.7 // indirect
6166
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
6267
gopkg.in/inf.v0 v0.9.1 // indirect
6368
gopkg.in/yaml.v2 v2.4.0 // indirect
@@ -66,10 +71,8 @@ require (
6671
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect
6772
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
6873
sigs.k8s.io/controller-runtime v0.21.0 // indirect
69-
sigs.k8s.io/jobset v0.8.2 // indirect
7074
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
7175
sigs.k8s.io/randfill v1.0.0 // indirect
72-
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect
7376
sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect
7477
sigs.k8s.io/yaml v1.6.0 // indirect
7578
)

go.sum

Lines changed: 101 additions & 98 deletions
Large diffs are not rendered by default.

tests/common/support/client.go

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
imagev1 "github.com/openshift/client-go/image/clientset/versioned"
3434
machinev1 "github.com/openshift/client-go/machine/clientset/versioned"
3535
routev1 "github.com/openshift/client-go/route/clientset/versioned"
36+
kueueoperatorclient "github.com/openshift/kueue-operator/pkg/generated/clientset/versioned"
3637
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
3738
// to ensure that exec-entrypoint and run can make use of them.
3839
)
@@ -43,6 +44,7 @@ type Client interface {
4344
Kubeflow() kubeflowclient.Interface
4445
JobSet() jobsetclient.Interface
4546
Kueue() kueueclient.Interface
47+
KueueOperator() kueueoperatorclient.Interface
4648
Machine() machinev1.Interface
4749
Route() routev1.Interface
4850
Image() imagev1.Interface
@@ -52,17 +54,18 @@ type Client interface {
5254
}
5355

5456
type testClient struct {
55-
core kubernetes.Interface
56-
trainer trainerclient.Interface
57-
kubeflow kubeflowclient.Interface
58-
jobset jobsetclient.Interface
59-
kueue kueueclient.Interface
60-
machine machinev1.Interface
61-
route routev1.Interface
62-
image imagev1.Interface
63-
ray rayclient.Interface
64-
dynamic dynamic.Interface
65-
storage storageclient.StorageV1Interface
57+
core kubernetes.Interface
58+
trainer trainerclient.Interface
59+
kubeflow kubeflowclient.Interface
60+
jobset jobsetclient.Interface
61+
kueue kueueclient.Interface
62+
kueueOperator kueueoperatorclient.Interface
63+
machine machinev1.Interface
64+
route routev1.Interface
65+
image imagev1.Interface
66+
ray rayclient.Interface
67+
dynamic dynamic.Interface
68+
storage storageclient.StorageV1Interface
6669
}
6770

6871
var _ Client = (*testClient)(nil)
@@ -87,6 +90,10 @@ func (t *testClient) Kueue() kueueclient.Interface {
8790
return t.kueue
8891
}
8992

93+
func (t *testClient) KueueOperator() kueueoperatorclient.Interface {
94+
return t.kueueOperator
95+
}
96+
9097
func (t *testClient) Machine() machinev1.Interface {
9198
return t.machine
9299
}
@@ -148,6 +155,11 @@ func newTestClient(cfg *rest.Config) (Client, *rest.Config, error) {
148155
return nil, nil, err
149156
}
150157

158+
kueueOperatorClient, err := kueueoperatorclient.NewForConfig(cfg)
159+
if err != nil {
160+
return nil, nil, err
161+
}
162+
151163
machineClient, err := machinev1.NewForConfig(cfg)
152164
if err != nil {
153165
return nil, nil, err
@@ -179,16 +191,17 @@ func newTestClient(cfg *rest.Config) (Client, *rest.Config, error) {
179191
}
180192

181193
return &testClient{
182-
core: kubeClient,
183-
trainer: trainerClient,
184-
kubeflow: kubeflowClient,
185-
jobset: jobsetClient,
186-
kueue: kueueClient,
187-
machine: machineClient,
188-
route: routeClient,
189-
image: imageClient,
190-
ray: rayClient,
191-
dynamic: dynamicClient,
192-
storage: storageClient,
194+
core: kubeClient,
195+
trainer: trainerClient,
196+
kubeflow: kubeflowClient,
197+
jobset: jobsetClient,
198+
kueue: kueueClient,
199+
kueueOperator: kueueOperatorClient,
200+
machine: machineClient,
201+
route: routeClient,
202+
image: imageClient,
203+
ray: rayClient,
204+
dynamic: dynamicClient,
205+
storage: storageClient,
193206
}, cfg, nil
194207
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/*
2+
Copyright 2025.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package support
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"time"
23+
24+
"github.com/onsi/gomega"
25+
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
28+
"k8s.io/apimachinery/pkg/runtime/schema"
29+
"k8s.io/client-go/dynamic"
30+
)
31+
32+
var DscGVR = schema.GroupVersionResource{
33+
Group: "datasciencecluster.opendatahub.io",
34+
Version: "v2",
35+
Resource: "datascienceclusters",
36+
}
37+
38+
func getDSC(dynamicClient dynamic.Interface, ctx context.Context, name string) (*unstructured.Unstructured, error) {
39+
return dynamicClient.Resource(DscGVR).Get(ctx, name, metav1.GetOptions{})
40+
}
41+
42+
func updateDSC(dynamicClient dynamic.Interface, ctx context.Context, dsc *unstructured.Unstructured) (*unstructured.Unstructured, error) {
43+
return dynamicClient.Resource(DscGVR).Update(ctx, dsc, metav1.UpdateOptions{})
44+
}
45+
46+
func setComponentState(dynamicClient dynamic.Interface, ctx context.Context, dscName, component, state string) error {
47+
dsc, err := getDSC(dynamicClient, ctx, dscName)
48+
if err != nil {
49+
return err
50+
}
51+
52+
err = unstructured.SetNestedField(dsc.Object, state, "spec", "components", component, "managementState")
53+
if err != nil {
54+
return err
55+
}
56+
57+
_, err = updateDSC(dynamicClient, ctx, dsc)
58+
return err
59+
}
60+
61+
func WaitForComponentState(dynamicClient dynamic.Interface, ctx context.Context, dscName, component, expectedState string, timeout time.Duration) error {
62+
deadline := time.Now().Add(timeout)
63+
for time.Now().Before(deadline) {
64+
dsc, err := getDSC(dynamicClient, ctx, dscName)
65+
if err != nil {
66+
return err
67+
}
68+
currentState := ComponentStatusManagementState(dsc, component)
69+
if currentState == expectedState {
70+
return nil
71+
}
72+
time.Sleep(2 * time.Second)
73+
}
74+
return fmt.Errorf("timeout waiting for component %s to reach state %s", component, expectedState)
75+
}
76+
77+
func SetComponentStateAndWait(dynamicClient dynamic.Interface, ctx context.Context, dscName, component, state string, timeout time.Duration) error {
78+
if err := setComponentState(dynamicClient, ctx, dscName, component, state); err != nil {
79+
return err
80+
}
81+
return WaitForComponentState(dynamicClient, ctx, dscName, component, state, timeout)
82+
}
83+
84+
func GetDSC(test Test, name string) (*unstructured.Unstructured, error) {
85+
return getDSC(test.Client().Dynamic(), test.Ctx(), name)
86+
}
87+
88+
func SetComponentToUnmanaged(test Test, dscName string, component string) error {
89+
return setComponentState(test.Client().Dynamic(), test.Ctx(), dscName, component, "Unmanaged")
90+
}
91+
92+
func DSCResource(test Test, name string) func(g gomega.Gomega) *unstructured.Unstructured {
93+
return func(g gomega.Gomega) *unstructured.Unstructured {
94+
dsc, err := GetDSC(test, name)
95+
g.Expect(err).NotTo(gomega.HaveOccurred())
96+
return dsc
97+
}
98+
}
99+
100+
func ComponentStatusManagementState(dsc *unstructured.Unstructured, component string) string {
101+
state, found, err := unstructured.NestedString(dsc.Object, "status", "components", component, "managementState")
102+
if err != nil || !found {
103+
return ""
104+
}
105+
return state
106+
}
107+
108+
func ComponentConditionStatus(dsc *unstructured.Unstructured, conditionType string) string {
109+
conditions, found, err := unstructured.NestedSlice(dsc.Object, "status", "conditions")
110+
if err != nil || !found {
111+
return ""
112+
}
113+
for _, c := range conditions {
114+
condition, ok := c.(map[string]interface{})
115+
if !ok {
116+
continue
117+
}
118+
if condition["type"] == conditionType {
119+
if status, ok := condition["status"].(string); ok {
120+
return status
121+
}
122+
}
123+
}
124+
return ""
125+
}

tests/common/support/kueue.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,21 @@ func KueueWorkloadAdmitted(workload *kueuev1beta1.Workload) bool {
129129
}
130130
return false
131131
}
132+
133+
func KueueWorkloadEvicted(workload *kueuev1beta1.Workload) bool {
134+
for _, v := range workload.Status.Conditions {
135+
if v.Type == "Evicted" && v.Status == "True" {
136+
return true
137+
}
138+
}
139+
return false
140+
}
141+
142+
func KueueWorkloadInadmissible(workload *kueuev1beta1.Workload) (bool, string) {
143+
for _, v := range workload.Status.Conditions {
144+
if v.Type == "QuotaReserved" && v.Status == "False" && v.Reason == "Inadmissible" {
145+
return true, v.Message
146+
}
147+
}
148+
return false, ""
149+
}

0 commit comments

Comments
 (0)