Skip to content

Commit 10c7292

Browse files
committed
remove slack cluster queue functionality
1 parent f459b91 commit 10c7292

File tree

11 files changed

+2
-321
lines changed

11 files changed

+2
-321
lines changed

Makefile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,6 @@ EXTERNAL_CRDS_DIR ?= $(shell pwd)/dep-crds
9999
KUEUE_ROOT = $(shell go list -m -mod=readonly -f "{{.Dir}}" sigs.k8s.io/kueue)
100100
.PHONY: dep-crds
101101
dep-crds: ## Copy CRDs from external operators to dep-crds directory.
102-
mkdir -p $(EXTERNAL_CRDS_DIR)/kueue
103-
cp -f $(KUEUE_ROOT)/config/components/crd/bases/* $(EXTERNAL_CRDS_DIR)/kueue
104102

105103
.PHONY: test
106104
test: manifests generate fmt vet dep-crds envtest ## Run unit tests.

cmd/main.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,6 @@ import (
4747
"sigs.k8s.io/controller-runtime/pkg/webhook"
4848
"sigs.k8s.io/yaml"
4949

50-
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
51-
5250
awv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
5351
"github.com/project-codeflare/appwrapper/internal/metrics"
5452
"github.com/project-codeflare/appwrapper/pkg/config"
@@ -66,7 +64,6 @@ var (
6664

6765
func init() {
6866
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
69-
utilruntime.Must(kueue.AddToScheme(scheme))
7067
utilruntime.Must(awv1beta2.AddToScheme(scheme))
7168
//+kubebuilder:scaffold:scheme
7269
}

config/rbac/role.yaml

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,16 +105,6 @@ rules:
105105
- patch
106106
- update
107107
- watch
108-
- apiGroups:
109-
- kueue.x-k8s.io
110-
resources:
111-
- clusterqueues
112-
verbs:
113-
- get
114-
- list
115-
- patch
116-
- update
117-
- watch
118108
- apiGroups:
119109
- ray.io
120110
resources:

internal/controller/appwrapper/fixtures_test.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ import (
3030
"k8s.io/apimachinery/pkg/types"
3131
"k8s.io/utils/ptr"
3232
"sigs.k8s.io/controller-runtime/pkg/client"
33-
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
3433
"sigs.k8s.io/yaml"
3534

3635
awv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
@@ -217,16 +216,3 @@ func malformedPod(milliCPU int64) awv1beta2.AppWrapperComponent {
217216
Template: runtime.RawExtension{Raw: jsonBytes},
218217
}
219218
}
220-
221-
func slackQueue(queueName string, nominalQuota resource.Quantity) *kueue.ClusterQueue {
222-
return &kueue.ClusterQueue{
223-
TypeMeta: metav1.TypeMeta{APIVersion: kueue.GroupVersion.String(), Kind: "ClusterQueue"},
224-
ObjectMeta: metav1.ObjectMeta{Name: queueName},
225-
Spec: kueue.ClusterQueueSpec{
226-
ResourceGroups: []kueue.ResourceGroup{{
227-
CoveredResources: []v1.ResourceName{v1.ResourceName("nvidia.com/gpu")},
228-
Flavors: []kueue.FlavorQuotas{{
229-
Name: "default-flavor",
230-
Resources: []kueue.ResourceQuota{{Name: v1.ResourceName("nvidia.com/gpu"), NominalQuota: nominalQuota}}}}}}},
231-
}
232-
}

internal/controller/appwrapper/node_health_monitor.go

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,10 @@ import (
2424
v1 "k8s.io/api/core/v1"
2525
"k8s.io/apimachinery/pkg/api/errors"
2626
"k8s.io/apimachinery/pkg/api/resource"
27-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2827
"k8s.io/apimachinery/pkg/util/sets"
2928

3029
ctrl "sigs.k8s.io/controller-runtime"
3130
"sigs.k8s.io/controller-runtime/pkg/client"
32-
"sigs.k8s.io/controller-runtime/pkg/event"
3331
"sigs.k8s.io/controller-runtime/pkg/handler"
3432
"sigs.k8s.io/controller-runtime/pkg/log"
3533

@@ -44,7 +42,6 @@ import (
4442
type NodeHealthMonitor struct {
4543
client.Client
4644
Config *config.AppWrapperConfig
47-
Events chan event.GenericEvent // event channel for NodeHealthMonitor to trigger SlackClusterQueueMonitor
4845
}
4946

5047
var (
@@ -85,16 +82,6 @@ func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ct
8582
return ctrl.Result{}, nil
8683
}
8784

88-
func (r *NodeHealthMonitor) triggerSlackCQMonitor() {
89-
if r.Config.SlackQueueName != "" {
90-
select {
91-
case r.Events <- event.GenericEvent{Object: &metav1.PartialObjectMetadata{ObjectMeta: metav1.ObjectMeta{Name: r.Config.SlackQueueName}}}:
92-
default:
93-
// do not block if event is already in channel
94-
}
95-
}
96-
}
97-
9885
// update noExecuteNodes and noScheduleNodes for the deletion of nodeName
9986
func (r *NodeHealthMonitor) updateForNodeDeletion(ctx context.Context, nodeName string) {
10087
if _, ok := noExecuteNodes[nodeName]; ok {
@@ -103,15 +90,13 @@ func (r *NodeHealthMonitor) updateForNodeDeletion(ctx context.Context, nodeName
10390
noExecuteNodesMutex.Unlock() // END CRITICAL SECTION
10491
log.FromContext(ctx).Info("Updated NoExecute information due to Node deletion",
10592
"Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
106-
r.triggerSlackCQMonitor()
10793
}
10894
if _, ok := noScheduleNodes[nodeName]; ok {
10995
noScheduleNodesMutex.Lock() // BEGIN CRITICAL SECTION
11096
delete(noScheduleNodes, nodeName)
11197
noScheduleNodesMutex.Unlock() // END CRITICAL SECTION
11298
log.FromContext(ctx).Info("Updated NoSchedule information due to Node deletion",
11399
"Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
114-
r.triggerSlackCQMonitor()
115100
}
116101
}
117102

@@ -146,7 +131,6 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
146131

147132
if noExecuteNodesChanged {
148133
log.FromContext(ctx).Info("Updated NoExecute information", "Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes)
149-
r.triggerSlackCQMonitor()
150134
}
151135
}
152136

@@ -192,7 +176,6 @@ func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.
192176

193177
if noScheduleNodesChanged {
194178
log.FromContext(ctx).Info("Updated NoSchedule information", "Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes)
195-
r.triggerSlackCQMonitor()
196179
}
197180
}
198181

internal/controller/appwrapper/node_health_monitor_test.go

Lines changed: 0 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,13 @@ import (
2424
"k8s.io/apimachinery/pkg/api/resource"
2525
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2626
"k8s.io/apimachinery/pkg/types"
27-
"sigs.k8s.io/controller-runtime/pkg/event"
2827
"sigs.k8s.io/controller-runtime/pkg/reconcile"
2928
)
3029

3130
var _ = Describe("NodeMonitor Controller", func() {
32-
var slackQueueName = "fake-queue"
33-
var dispatch = types.NamespacedName{Name: slackQueueName}
3431
var node1Name = types.NamespacedName{Name: "fake-node-1"}
3532
var node2Name = types.NamespacedName{Name: "fake-node-2"}
3633
var nodeMonitor *NodeHealthMonitor
37-
var cqMonitor *SlackClusterQueueMonitor
3834
nodeGPUs := v1.ResourceList{v1.ResourceName("nvidia.com/gpu"): resource.MustParse("4")}
3935

4036
createNode := func(nodeName string) {
@@ -58,23 +54,14 @@ var _ = Describe("NodeMonitor Controller", func() {
5854
BeforeEach(func() {
5955
// Create reconcilers
6056
awConfig := config.NewAppWrapperConfig()
61-
awConfig.SlackQueueName = slackQueueName
62-
conduit := make(chan event.GenericEvent, 1)
6357
nodeMonitor = &NodeHealthMonitor{
6458
Client: k8sClient,
6559
Config: awConfig,
66-
Events: conduit,
67-
}
68-
cqMonitor = &SlackClusterQueueMonitor{
69-
Client: k8sClient,
70-
Config: awConfig,
71-
Events: conduit,
7260
}
7361
})
7462

7563
AfterEach(func() {
7664
nodeMonitor = nil
77-
cqMonitor = nil
7865
})
7966

8067
It("Autopilot Monitoring", func() {
@@ -120,113 +107,4 @@ var _ = Describe("NodeMonitor Controller", func() {
120107
deleteNode(node1Name.Name)
121108
deleteNode(node2Name.Name)
122109
})
123-
124-
It("ClusterQueue Lending Adjustment", func() {
125-
createNode(node1Name.Name)
126-
createNode(node2Name.Name)
127-
128-
_, err := nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
129-
Expect(err).NotTo(HaveOccurred())
130-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
131-
Expect(err).NotTo(HaveOccurred())
132-
133-
// start with 6 gpus
134-
queue := slackQueue(slackQueueName, resource.MustParse("6"))
135-
Expect(k8sClient.Create(ctx, queue)).To(Succeed())
136-
137-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
138-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())
139-
140-
// remove 4 gpus, lending limit should be 2
141-
node1 := getNode(node1Name.Name)
142-
node1.Labels["autopilot.ibm.com/gpuhealth"] = "EVICT"
143-
Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
144-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
145-
Expect(err).NotTo(HaveOccurred())
146-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
147-
Expect(err).NotTo(HaveOccurred())
148-
149-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
150-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))
151-
152-
// remove another 4 gpus, lending limit should be 0 = max(0, 6-4-4)
153-
node2 := getNode(node2Name.Name)
154-
node2.Labels["autopilot.ibm.com/gpuhealth"] = "TESTING"
155-
Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
156-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
157-
Expect(err).NotTo(HaveOccurred())
158-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
159-
Expect(err).NotTo(HaveOccurred())
160-
161-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
162-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
163-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(0)))
164-
165-
// restore 4 gpus, lending limit should be 2
166-
node1.Labels["autopilot.ibm.com/gpuhealth"] = "OK"
167-
Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
168-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
169-
Expect(err).NotTo(HaveOccurred())
170-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
171-
Expect(err).NotTo(HaveOccurred())
172-
173-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
174-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
175-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))
176-
177-
// restore last 4 gpus, lending limit should be nil
178-
node2.Labels["autopilot.ibm.com/gpuhealth"] = "OK"
179-
Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
180-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
181-
Expect(err).NotTo(HaveOccurred())
182-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
183-
Expect(err).NotTo(HaveOccurred())
184-
185-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
186-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())
187-
188-
// cordon node1, lending limit should be 2
189-
node1 = getNode(node1Name.Name)
190-
node1.Spec.Unschedulable = true
191-
Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
192-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
193-
Expect(err).NotTo(HaveOccurred())
194-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
195-
Expect(err).NotTo(HaveOccurred())
196-
197-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
198-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))
199-
200-
// Increase the slack cluster queue's quota by 2 and expect LendingLimit to increase by 2 to become 4
201-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
202-
queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].NominalQuota = resource.MustParse("8")
203-
Expect(k8sClient.Update(ctx, queue)).Should(Succeed())
204-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: slackQueueName}})
205-
Expect(err).NotTo(HaveOccurred())
206-
207-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
208-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(4)))
209-
210-
// Deleting a noncordoned node should not change the lending limit
211-
deleteNode(node2Name.Name)
212-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
213-
Expect(err).NotTo(HaveOccurred())
214-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
215-
Expect(err).NotTo(HaveOccurred())
216-
217-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
218-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(4)))
219-
220-
// Delete the cordoned node; lending limit should now be nil
221-
deleteNode(node1Name.Name)
222-
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
223-
Expect(err).NotTo(HaveOccurred())
224-
_, err = cqMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: dispatch})
225-
Expect(err).NotTo(HaveOccurred())
226-
227-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
228-
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())
229-
230-
Expect(k8sClient.Delete(ctx, queue)).To(Succeed())
231-
})
232110
})

0 commit comments

Comments
 (0)