@@ -24,17 +24,13 @@ import (
2424 "k8s.io/apimachinery/pkg/api/resource"
2525 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2626 "k8s.io/apimachinery/pkg/types"
27- "sigs.k8s.io/controller-runtime/pkg/event"
2827 "sigs.k8s.io/controller-runtime/pkg/reconcile"
2928)
3029
3130var _ = Describe ("NodeMonitor Controller" , func () {
32- var slackQueueName = "fake-queue"
33- var dispatch = types.NamespacedName {Name : slackQueueName }
3431 var node1Name = types.NamespacedName {Name : "fake-node-1" }
3532 var node2Name = types.NamespacedName {Name : "fake-node-2" }
3633 var nodeMonitor * NodeHealthMonitor
37- var cqMonitor * SlackClusterQueueMonitor
3834 nodeGPUs := v1.ResourceList {v1 .ResourceName ("nvidia.com/gpu" ): resource .MustParse ("4" )}
3935
4036 createNode := func (nodeName string ) {
@@ -58,23 +54,14 @@ var _ = Describe("NodeMonitor Controller", func() {
5854 BeforeEach (func () {
5955 // Create reconcilers
6056 awConfig := config .NewAppWrapperConfig ()
61- awConfig .SlackQueueName = slackQueueName
62- conduit := make (chan event.GenericEvent , 1 )
6357 nodeMonitor = & NodeHealthMonitor {
6458 Client : k8sClient ,
6559 Config : awConfig ,
66- Events : conduit ,
67- }
68- cqMonitor = & SlackClusterQueueMonitor {
69- Client : k8sClient ,
70- Config : awConfig ,
71- Events : conduit ,
7260 }
7361 })
7462
7563 AfterEach (func () {
7664 nodeMonitor = nil
77- cqMonitor = nil
7865 })
7966
8067 It ("Autopilot Monitoring" , func () {
@@ -120,113 +107,4 @@ var _ = Describe("NodeMonitor Controller", func() {
120107 deleteNode (node1Name .Name )
121108 deleteNode (node2Name .Name )
122109 })
123-
124- It ("ClusterQueue Lending Adjustment" , func () {
125- createNode (node1Name .Name )
126- createNode (node2Name .Name )
127-
128- _ , err := nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node1Name })
129- Expect (err ).NotTo (HaveOccurred ())
130- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node2Name })
131- Expect (err ).NotTo (HaveOccurred ())
132-
133- // start with 6 gpus
134- queue := slackQueue (slackQueueName , resource .MustParse ("6" ))
135- Expect (k8sClient .Create (ctx , queue )).To (Succeed ())
136-
137- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
138- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit ).Should (BeNil ())
139-
140- // remove 4 gpus, lending limit should be 2
141- node1 := getNode (node1Name .Name )
142- node1 .Labels ["autopilot.ibm.com/gpuhealth" ] = "EVICT"
143- Expect (k8sClient .Update (ctx , node1 )).Should (Succeed ())
144- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node1Name })
145- Expect (err ).NotTo (HaveOccurred ())
146- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : dispatch })
147- Expect (err ).NotTo (HaveOccurred ())
148-
149- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
150- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit .Value ()).Should (Equal (int64 (2 )))
151-
152- // remove another 4 gpus, lending limit should be 0 = max(0, 6-4-4)
153- node2 := getNode (node2Name .Name )
154- node2 .Labels ["autopilot.ibm.com/gpuhealth" ] = "TESTING"
155- Expect (k8sClient .Update (ctx , node2 )).Should (Succeed ())
156- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node2Name })
157- Expect (err ).NotTo (HaveOccurred ())
158- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : dispatch })
159- Expect (err ).NotTo (HaveOccurred ())
160-
161- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
162- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit ).ShouldNot (BeNil ())
163- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit .Value ()).Should (Equal (int64 (0 )))
164-
165- // restore 4 gpus, lending limit should be 2
166- node1 .Labels ["autopilot.ibm.com/gpuhealth" ] = "OK"
167- Expect (k8sClient .Update (ctx , node1 )).Should (Succeed ())
168- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node1Name })
169- Expect (err ).NotTo (HaveOccurred ())
170- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : dispatch })
171- Expect (err ).NotTo (HaveOccurred ())
172-
173- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
174- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit ).ShouldNot (BeNil ())
175- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit .Value ()).Should (Equal (int64 (2 )))
176-
177- // restore last 4 gpus, lending limit should be nil
178- node2 .Labels ["autopilot.ibm.com/gpuhealth" ] = "OK"
179- Expect (k8sClient .Update (ctx , node2 )).Should (Succeed ())
180- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node2Name })
181- Expect (err ).NotTo (HaveOccurred ())
182- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : dispatch })
183- Expect (err ).NotTo (HaveOccurred ())
184-
185- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
186- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit ).Should (BeNil ())
187-
188- // cordon node1, lending limit should be 2
189- node1 = getNode (node1Name .Name )
190- node1 .Spec .Unschedulable = true
191- Expect (k8sClient .Update (ctx , node1 )).Should (Succeed ())
192- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node1Name })
193- Expect (err ).NotTo (HaveOccurred ())
194- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : dispatch })
195- Expect (err ).NotTo (HaveOccurred ())
196-
197- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
198- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit .Value ()).Should (Equal (int64 (2 )))
199-
200- // Increase the slack cluster queue's quota by 2 and expect LendingLimit to increase by 2 to become 4
201- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
202- queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].NominalQuota = resource .MustParse ("8" )
203- Expect (k8sClient .Update (ctx , queue )).Should (Succeed ())
204- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : types.NamespacedName {Name : slackQueueName }})
205- Expect (err ).NotTo (HaveOccurred ())
206-
207- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
208- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit .Value ()).Should (Equal (int64 (4 )))
209-
210- // Deleting a noncordoned node should not change the lending limit
211- deleteNode (node2Name .Name )
212- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node2Name })
213- Expect (err ).NotTo (HaveOccurred ())
214- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : dispatch })
215- Expect (err ).NotTo (HaveOccurred ())
216-
217- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
218- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit .Value ()).Should (Equal (int64 (4 )))
219-
220- // Delete the cordoned node; lending limit should now be nil
221- deleteNode (node1Name .Name )
222- _ , err = nodeMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : node1Name })
223- Expect (err ).NotTo (HaveOccurred ())
224- _ , err = cqMonitor .Reconcile (ctx , reconcile.Request {NamespacedName : dispatch })
225- Expect (err ).NotTo (HaveOccurred ())
226-
227- Expect (k8sClient .Get (ctx , types.NamespacedName {Name : slackQueueName }, queue )).Should (Succeed ())
228- Expect (queue .Spec .ResourceGroups [0 ].Flavors [0 ].Resources [0 ].LendingLimit ).Should (BeNil ())
229-
230- Expect (k8sClient .Delete (ctx , queue )).To (Succeed ())
231- })
232110})
0 commit comments