Skip to content

Commit eb91ade

Browse files
Separate the pytorchjob's validating admission policy test for kfto tests to be used in downstream tests
1 parent 97a7614 commit eb91ade

File tree

3 files changed

+261
-79
lines changed

3 files changed

+261
-79
lines changed

tests/kfto/support.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ import (
2020
"embed"
2121
"time"
2222

23+
gonanoid "github.com/matoous/go-nanoid/v2"
2324
"github.com/onsi/gomega"
2425
. "github.com/onsi/gomega"
2526
prometheusapiv1 "github.com/prometheus/client_golang/api/prometheus/v1"
2627
prometheusmodel "github.com/prometheus/common/model"
27-
2828
corev1 "k8s.io/api/core/v1"
2929

3030
"github.com/opendatahub-io/distributed-workloads/tests/common/support"
@@ -75,3 +75,9 @@ func withEnvVarName(name string) compare[corev1.EnvVar] {
7575
return e1.Name == name
7676
}
7777
}
78+
79+
// Adds a unique suffix to the provided string
80+
func uniqueSuffix(prefix string) string {
81+
suffix := gonanoid.MustGenerate("1234567890abcdef", 4)
82+
return prefix + "-" + suffix
83+
}
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package kfto
18+
19+
import (
20+
"encoding/json"
21+
"testing"
22+
23+
kftrainingv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
24+
. "github.com/onsi/gomega"
25+
26+
vapv1 "k8s.io/api/admissionregistration/v1"
27+
corev1 "k8s.io/api/core/v1"
28+
"k8s.io/apimachinery/pkg/api/resource"
29+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+
"k8s.io/apimachinery/pkg/types"
31+
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
32+
testingpytorchjob "sigs.k8s.io/kueue/pkg/util/testingjobs/pytorchjob"
33+
34+
. "github.com/opendatahub-io/distributed-workloads/tests/common"
35+
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
36+
)
37+
38+
// Note: This test must run on an OCP v4.17 or later cluster.
39+
// The Validating Admission Policy feature gate is GA and enabled by default from OCP v4.17 (k8s v1.30)
40+
41+
var (
42+
ns *corev1.Namespace
43+
nsNoLabel *corev1.Namespace
44+
rf *kueuev1beta1.ResourceFlavor
45+
cq *kueuev1beta1.ClusterQueue
46+
lq *kueuev1beta1.LocalQueue
47+
pyt *kftrainingv1.PyTorchJob
48+
vapb *vapv1.ValidatingAdmissionPolicyBinding
49+
vapbCopy *vapv1.ValidatingAdmissionPolicyBinding
50+
pytWithLQName = "pyt-with-lq"
51+
pytNoLQName = "pyt-no-lq"
52+
)
53+
54+
func TestValidatingAdmissionPolicy(t *testing.T) {
55+
test := With(t)
56+
57+
Tags(t, Sanity)
58+
59+
// Create a namespace
60+
ns = CreateTestNamespaceWithName(test, uniqueSuffix("vap"))
61+
defer test.Client().Core().CoreV1().Namespaces().Delete(test.Ctx(), ns.Name, metav1.DeleteOptions{})
62+
63+
// Get the latest version of the namespace
64+
ns, err := test.Client().Core().CoreV1().Namespaces().Get(test.Ctx(), ns.Name, metav1.GetOptions{})
65+
test.Expect(err).ToNot(HaveOccurred())
66+
67+
// Add the 'kueue.openshift.io/managed' label
68+
ns.Labels = map[string]string{
69+
"kueue.openshift.io/managed": "true",
70+
}
71+
_, err = test.Client().Core().CoreV1().Namespaces().Update(test.Ctx(), ns, metav1.UpdateOptions{})
72+
test.Expect(err).ToNot(HaveOccurred())
73+
74+
// Create a namespace that will not receive the `kueue.x-k8s.io/queue-name` label
75+
nsNoLabel = CreateTestNamespaceWithName(test, uniqueSuffix("vap-nl"))
76+
defer test.Client().Core().CoreV1().Namespaces().Delete(test.Ctx(), nsNoLabel.Name, metav1.DeleteOptions{})
77+
78+
// Create a resource flavor
79+
rf = CreateKueueResourceFlavor(test, kueuev1beta1.ResourceFlavorSpec{})
80+
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), rf.Name, metav1.DeleteOptions{})
81+
82+
// Create a cluster queue
83+
cqSpec := kueuev1beta1.ClusterQueueSpec{
84+
NamespaceSelector: &metav1.LabelSelector{},
85+
ResourceGroups: []kueuev1beta1.ResourceGroup{
86+
{
87+
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")},
88+
Flavors: []kueuev1beta1.FlavorQuotas{
89+
{
90+
Name: kueuev1beta1.ResourceFlavorReference(rf.Name),
91+
Resources: []kueuev1beta1.ResourceQuota{
92+
{
93+
Name: corev1.ResourceCPU,
94+
NominalQuota: resource.MustParse("3"),
95+
},
96+
{
97+
Name: corev1.ResourceMemory,
98+
NominalQuota: resource.MustParse("8Gi"),
99+
},
100+
{
101+
Name: corev1.ResourceName("nvidia.com/gpu"),
102+
NominalQuota: resource.MustParse("0"),
103+
},
104+
},
105+
},
106+
},
107+
},
108+
},
109+
}
110+
// Create a cluster queue
111+
cq = CreateKueueClusterQueue(test, cqSpec)
112+
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), cq.Name, metav1.DeleteOptions{})
113+
114+
// Create a local queue
115+
lq = CreateKueueLocalQueue(test, ns.Name, cq.Name)
116+
defer test.Client().Kueue().KueueV1beta1().LocalQueues(ns.Name).Delete(test.Ctx(), lq.Name, metav1.DeleteOptions{})
117+
118+
// Snapshot the original ValidatingAdmissionPolicyBinding state
119+
vapb, err = test.Client().Core().AdmissionregistrationV1().ValidatingAdmissionPolicyBindings().Get(test.Ctx(), "kueue-validating-admission-policy-binding", metav1.GetOptions{})
120+
test.Expect(err).ToNot(HaveOccurred())
121+
122+
vapbCopy = vapb.DeepCopy()
123+
defer revertVAPB(test, vapbCopy)
124+
125+
/**************************************************************************
126+
Testing the default behavior with the ValidatingAdmissionPolicyBinding enforcement enabled.
127+
**************************************************************************/
128+
t.Run("Default ValidatingAdmissionPolicyBinding", func(t *testing.T) {
129+
t.Run("PyTorchJob Tests", func(t *testing.T) {
130+
t.Run("PyTorchJob should be admitted with the 'kueue.x-k8s.io/queue-name' label set", func(t *testing.T) {
131+
err = createPyTorchJobWithLocalQueue(test, ns.Name, lq.Name)
132+
test.Expect(err).ToNot(HaveOccurred())
133+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
134+
})
135+
t.Run("PyTorchJob should not be admitted without the 'kueue.x-k8s.io/queue-name' label set", func(t *testing.T) {
136+
err = createPyTorchJob(test, ns.Name)
137+
test.Expect(err).ToNot(BeNil())
138+
test.Expect(err.Error()).To(ContainSubstring("The label 'kueue.x-k8s.io/queue-name' is either missing or does not have a value set"))
139+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
140+
})
141+
})
142+
})
143+
144+
/**************************************************************************
145+
Testing the 1st alternative behavior with the ValidatingAdmissionPolicyBinding enforcement disabled.
146+
**************************************************************************/
147+
t.Run("Disable the ValidatingAdmissionPolicy enforcement", func(t *testing.T) {
148+
vapb, err := test.Client().Core().AdmissionregistrationV1().ValidatingAdmissionPolicyBindings().Get(test.Ctx(), vapb.Name, metav1.GetOptions{})
149+
test.Expect(err).ToNot(HaveOccurred())
150+
151+
vapb.Spec.PolicyName = "none"
152+
_, err = test.Client().Core().AdmissionregistrationV1().ValidatingAdmissionPolicyBindings().Update(test.Ctx(), vapb, metav1.UpdateOptions{})
153+
test.Expect(err).ToNot(HaveOccurred())
154+
defer revertVAPB(test, vapbCopy)
155+
156+
t.Run("PyTorchJob Tests", func(t *testing.T) {
157+
t.Run("PyTorchJob should be admitted with the 'kueue.x-k8s.io/queue-name' label set", func(t *testing.T) {
158+
err = createPyTorchJobWithLocalQueue(test, ns.Name, lq.Name)
159+
test.Expect(err).ToNot(HaveOccurred())
160+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
161+
})
162+
t.Run("PyTorchJob should be admitted without the 'kueue.x-k8s.io/queue-name' label set", func(t *testing.T) {
163+
err = createPyTorchJob(test, ns.Name)
164+
test.Expect(err).ToNot(HaveOccurred())
165+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
166+
})
167+
})
168+
})
169+
170+
/**************************************************************************
171+
Testing the 2nd alternative behavior which targets specific namespaces that have the 'kueue.openshift.io/managed' label
172+
**************************************************************************/
173+
t.Run("Custom ValidatingAdmissionPolicyBinding", func(t *testing.T) {
174+
// Apply the ValidatingAdmissionPolicyBinding targetting namespaces with the label 'kueue.openshift.io/managed'
175+
vapb, err = test.Client().Core().AdmissionregistrationV1().ValidatingAdmissionPolicyBindings().Get(test.Ctx(), vapb.Name, metav1.GetOptions{})
176+
test.Expect(err).ToNot(HaveOccurred())
177+
178+
vapb.Spec.MatchResources.NamespaceSelector.MatchLabels = map[string]string{"kueue.openshift.io/managed": "true"}
179+
_, err = test.Client().Core().AdmissionregistrationV1().ValidatingAdmissionPolicyBindings().Update(test.Ctx(), vapb, metav1.UpdateOptions{})
180+
test.Expect(err).ToNot(HaveOccurred())
181+
defer revertVAPB(test, vapbCopy)
182+
183+
t.Run("PyTorchJob Tests", func(t *testing.T) {
184+
t.Run("PyTorchJob should be admitted with the 'kueue.x-k8s.io/queue-name' label in a labeled namespace", func(t *testing.T) {
185+
err = createPyTorchJobWithLocalQueue(test, ns.Name, lq.Name)
186+
test.Expect(err).ToNot(HaveOccurred())
187+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
188+
})
189+
t.Run("PyTorchJob should not be admitted without the 'kueue.x-k8s.io/queue-name' label in a labeled namespace", func(t *testing.T) {
190+
err = createPyTorchJob(test, ns.Name)
191+
test.Expect(err).ToNot(BeNil())
192+
test.Expect(err.Error()).To(ContainSubstring("The label 'kueue.x-k8s.io/queue-name' is either missing or does not have a value set"))
193+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
194+
})
195+
t.Run("PyTorchJob should be admitted with the 'kueue.x-k8s.io/queue-name' label in any other namespace", func(t *testing.T) {
196+
err = createPyTorchJobWithLocalQueue(test, nsNoLabel.Name, lq.Name)
197+
test.Expect(err).ToNot(HaveOccurred())
198+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
199+
})
200+
t.Run("PyTorchJob should be admitted without the 'kueue.x-k8s.io/queue-name' label in any other namespace", func(t *testing.T) {
201+
err = createPyTorchJob(test, nsNoLabel.Name)
202+
test.Expect(err).ToNot(HaveOccurred())
203+
defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(ns.Name).Delete(test.Ctx(), pyt.Name, metav1.DeleteOptions{})
204+
})
205+
})
206+
})
207+
}
208+
209+
// Revert validating-admission-policy-binding to its original state
210+
func revertVAPB(test Test, vapbCopy *vapv1.ValidatingAdmissionPolicyBinding) {
211+
patchBytes, _ := json.Marshal(map[string]interface{}{
212+
"spec": map[string]interface{}{
213+
"policyName": vapbCopy.Spec.PolicyName,
214+
"matchResources": map[string]interface{}{
215+
"namespaceSelector": map[string]interface{}{
216+
"matchLabels": vapbCopy.Spec.MatchResources.NamespaceSelector.MatchLabels,
217+
},
218+
},
219+
},
220+
})
221+
_, err := test.Client().Core().AdmissionregistrationV1().ValidatingAdmissionPolicyBindings().Patch(test.Ctx(), vapbCopy.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
222+
test.Expect(err).ToNot(HaveOccurred())
223+
}
224+
225+
func createPyTorchJob(test Test, namespaceName string) error {
226+
pyt = testingpytorchjob.MakePyTorchJob(uniqueSuffix(pytNoLQName), namespaceName).Obj()
227+
pyt.Spec.PyTorchReplicaSpecs[kftrainingv1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Name = "pytorch"
228+
pyt.Spec.PyTorchReplicaSpecs[kftrainingv1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Name = "pytorch"
229+
230+
_, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespaceName).Create(test.Ctx(), pyt, metav1.CreateOptions{})
231+
return err
232+
}
233+
234+
func createPyTorchJobWithLocalQueue(test Test, namespaceName, localQueueName string) error {
235+
pyt = testingpytorchjob.MakePyTorchJob(uniqueSuffix(pytWithLQName), namespaceName).Queue(localQueueName).Obj()
236+
pyt.Spec.PyTorchReplicaSpecs[kftrainingv1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Name = "pytorch"
237+
pyt.Spec.PyTorchReplicaSpecs[kftrainingv1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Name = "pytorch"
238+
239+
_, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespaceName).Create(test.Ctx(), pyt, metav1.CreateOptions{})
240+
return err
241+
}

0 commit comments

Comments
 (0)