Skip to content

Commit 5102a97

Browse files
Merge branch 'skyler/ref-reconcile' into 'main'
Reconcile Controller On NodeSet Update Closes #123 See merge request SchedMD/slinky-dev/slurm-operator!235
2 parents 3b739dc + 7ff9b93 commit 5102a97

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+3112
-658
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ golangci-lint: golangci-lint-bin ## Run golangci-lint.
297297
golangci-lint-fmt: golangci-lint-bin ## Run golangci-lint fmt.
298298
$(GOLANGCI_LINT) fmt
299299

300-
CODECOV_PERCENT ?= 67
300+
CODECOV_PERCENT ?= 66
301301

302302
.PHONY: test
303303
test: envtest ## Run tests.

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ Run [Slurm] on [Kubernetes], by [SchedMD]. A [Slinky] project.
2020
- [Overview](#overview)
2121
- [Slurm Cluster](#slurm-cluster)
2222
- [Features](#features)
23+
- [Controller](#controller)
2324
- [NodeSets](#nodesets)
2425
- [LoginSets](#loginsets)
2526
- [Hybrid Support](#hybrid-support)
@@ -68,6 +69,21 @@ For additional information about Slurm, see the [slurm][slurm-docs] docs.
6869

6970
## Features
7071

72+
### Controller
73+
74+
The Slurm control-plane is responsible for scheduling Slurm workload onto its
75+
worker nodes and managing their states.
76+
77+
Changes to the Slurm configuration files are automatically detected and the
78+
Slurm cluster is reconfigured seamlessly with zero downtime of the Slurm
79+
control-plane.
80+
81+
> [!NOTE]
82+
> The kubelet's `configMapAndSecretChangeDetectionStrategy` and `syncFrequency`
83+
> settings directly affect when pods have their mounted ConfigMaps and Secrets
84+
> updated. By default, the kubelet is in `Watch` mode with a polling frequency
85+
> of 60 seconds.
86+
7187
### NodeSets
7288

7389
A set of homogeneous Slurm workers (compute nodes), which are delegated to

docs/index.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Table of Contents
2727

2828
- `Features <#features>`__
2929

30+
- `Controller <#controller>`__
3031
- `NodeSets <#nodesets>`__
3132
- `LoginSets <#loginsets>`__
3233
- `Hybrid Support <#hybrid-support>`__
@@ -98,6 +99,21 @@ For additional information about Slurm, see the
9899
Features
99100
--------
100101

102+
Controller
103+
~~~~~~~~~~
104+
105+
The Slurm control-plane is responsible for scheduling Slurm workload
106+
onto its worker nodes and managing their states.
107+
108+
Changes to the Slurm configuration files are automatically detected and
109+
the Slurm cluster is reconfigured seamlessly with zero downtime of the
110+
Slurm control-plane.
111+
112+
**Note:** The kubelet’s ``configMapAndSecretChangeDetectionStrategy``
113+
and ``syncFrequency`` settings directly affect when pods have their
114+
mounted ConfigMaps and Secrets updated. By default, the kubelet is in
115+
``Watch`` mode with a polling frequency of 60 seconds.
116+
101117
NodeSets
102118
~~~~~~~~
103119

internal/controller/accounting/accounting_controller.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
slinkyv1beta1 "github.com/SlinkyProject/slurm-operator/api/v1beta1"
2525
"github.com/SlinkyProject/slurm-operator/internal/builder"
26+
"github.com/SlinkyProject/slurm-operator/internal/controller/accounting/eventhandler"
2627
"github.com/SlinkyProject/slurm-operator/internal/utils/durationstore"
2728
"github.com/SlinkyProject/slurm-operator/internal/utils/refresolver"
2829
)
@@ -111,13 +112,8 @@ func (r *AccountingReconciler) SetupWithManager(mgr ctrl.Manager) error {
111112
Owns(&corev1.Service{}).
112113
Owns(&corev1.ConfigMap{}).
113114
Owns(&corev1.Secret{}).
114-
Watches(&slinkyv1beta1.Accounting{}, &accountingEventHandler{
115-
Reader: r.Client,
116-
refResolver: r.refResolver,
117-
}).
118-
Watches(&corev1.Secret{}, &secretEventHandler{
119-
Reader: r.Client,
120-
}).
115+
Watches(&slinkyv1beta1.Accounting{}, eventhandler.NewAccountingEventHandler(r.Client)).
116+
Watches(&corev1.Secret{}, eventhandler.NewSecretEventHandler(r.Client)).
121117
WithOptions(controller.Options{
122118
MaxConcurrentReconciles: maxConcurrentReconciles,
123119
}).

internal/controller/accounting/accounting_controller_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,21 @@ var _ = Describe("Accounting controller", func() {
4949
accountingKey := client.ObjectKeyFromObject(accounting)
5050
Eventually(func(g Gomega) {
5151
g.Expect(k8sClient.Get(ctx, accountingKey, createdAccounting)).To(Succeed())
52-
}).Should(Succeed())
52+
}, testutils.Timeout, testutils.Internal).Should(Succeed())
5353

5454
By("Expecting Accounting CR Service")
5555
serviceKey := accounting.ServiceKey()
5656
service := &corev1.Service{}
5757
Eventually(func(g Gomega) {
5858
g.Expect(k8sClient.Get(ctx, serviceKey, service)).To(Succeed())
59-
}).Should(Succeed())
59+
}, testutils.Timeout, testutils.Internal).Should(Succeed())
6060

6161
By("Expecting Accounting CR Statefulset")
6262
statefulsetKey := accounting.Key()
6363
statefulset := &appsv1.StatefulSet{}
6464
Eventually(func(g Gomega) {
6565
g.Expect(k8sClient.Get(ctx, statefulsetKey, statefulset)).To(Succeed())
66-
}).Should(Succeed())
66+
}, testutils.Timeout, testutils.Internal).Should(Succeed())
6767
}, SpecTimeout(testutils.Timeout))
6868
})
6969
})
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// SPDX-FileCopyrightText: Copyright (C) SchedMD LLC.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package eventhandler
5+
6+
import (
7+
"context"
8+
9+
"k8s.io/client-go/util/workqueue"
10+
"sigs.k8s.io/controller-runtime/pkg/client"
11+
"sigs.k8s.io/controller-runtime/pkg/event"
12+
"sigs.k8s.io/controller-runtime/pkg/handler"
13+
"sigs.k8s.io/controller-runtime/pkg/log"
14+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
15+
16+
slinkyv1beta1 "github.com/SlinkyProject/slurm-operator/api/v1beta1"
17+
"github.com/SlinkyProject/slurm-operator/internal/utils/objectutils"
18+
"github.com/SlinkyProject/slurm-operator/internal/utils/refresolver"
19+
)
20+
21+
func NewAccountingEventHandler(reader client.Reader) *AccountingEventHandler {
22+
return &AccountingEventHandler{
23+
Reader: reader,
24+
refResolver: refresolver.New(reader),
25+
}
26+
}
27+
28+
var _ handler.EventHandler = &AccountingEventHandler{}
29+
30+
type AccountingEventHandler struct {
31+
client.Reader
32+
refResolver *refresolver.RefResolver
33+
}
34+
35+
func (e *AccountingEventHandler) Create(
36+
ctx context.Context,
37+
evt event.CreateEvent,
38+
q workqueue.TypedRateLimitingInterface[reconcile.Request],
39+
) {
40+
e.enqueueRequest(ctx, evt.Object, q)
41+
}
42+
43+
func (e *AccountingEventHandler) Update(
44+
ctx context.Context,
45+
evt event.UpdateEvent,
46+
q workqueue.TypedRateLimitingInterface[reconcile.Request],
47+
) {
48+
e.enqueueRequest(ctx, evt.ObjectNew, q)
49+
}
50+
51+
func (e *AccountingEventHandler) Delete(
52+
ctx context.Context,
53+
evt event.DeleteEvent,
54+
q workqueue.TypedRateLimitingInterface[reconcile.Request],
55+
) {
56+
e.enqueueRequest(ctx, evt.Object, q)
57+
}
58+
59+
func (e *AccountingEventHandler) Generic(
60+
ctx context.Context,
61+
evt event.GenericEvent,
62+
q workqueue.TypedRateLimitingInterface[reconcile.Request],
63+
) {
64+
// Intentionally blank
65+
}
66+
67+
func (e *AccountingEventHandler) enqueueRequest(
68+
ctx context.Context,
69+
obj client.Object,
70+
q workqueue.TypedRateLimitingInterface[reconcile.Request],
71+
) {
72+
logger := log.FromContext(ctx)
73+
74+
accounting, ok := obj.(*slinkyv1beta1.Accounting)
75+
if !ok {
76+
return
77+
}
78+
79+
list, err := e.refResolver.GetControllersForAccounting(ctx, accounting)
80+
if err != nil {
81+
logger.Error(err, "failed to list Controllers referencing Accounting")
82+
return
83+
}
84+
85+
for _, item := range list.Items {
86+
objectutils.EnqueueRequest(q, &item)
87+
}
88+
}

internal/controller/accounting/accounting_eventhandler_test.go renamed to internal/controller/accounting/eventhandler/eventhandler_accounting_test.go

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
// SPDX-FileCopyrightText: Copyright (C) SchedMD LLC.
22
// SPDX-License-Identifier: Apache-2.0
33

4-
package accounting
4+
package eventhandler
55

66
import (
77
"context"
88
"testing"
99

1010
slinkyv1beta1 "github.com/SlinkyProject/slurm-operator/api/v1beta1"
11-
"github.com/SlinkyProject/slurm-operator/internal/utils/refresolver"
1211
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1312
"k8s.io/client-go/util/workqueue"
1413
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -17,18 +16,7 @@ import (
1716
"sigs.k8s.io/controller-runtime/pkg/reconcile"
1817
)
1918

20-
func newHandler(c client.Client) *accountingEventHandler {
21-
return &accountingEventHandler{
22-
Reader: c,
23-
refResolver: refresolver.New(c),
24-
}
25-
}
26-
27-
func newQueue() workqueue.TypedRateLimitingInterface[reconcile.Request] {
28-
return workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[reconcile.Request]())
29-
}
30-
31-
func Test_controllerEventHandler_Create(t *testing.T) {
19+
func Test_AccountingEventHandler_Create(t *testing.T) {
3220
type fields struct {
3321
client client.Client
3422
}
@@ -87,7 +75,7 @@ func Test_controllerEventHandler_Create(t *testing.T) {
8775
}
8876
for _, tt := range tests {
8977
t.Run(tt.name, func(t *testing.T) {
90-
e := newHandler(tt.fields.client)
78+
e := NewAccountingEventHandler(tt.fields.client)
9179
e.Create(tt.args.ctx, tt.args.evt, tt.args.q)
9280
if got := tt.args.q.Len(); got > tt.want {
9381
t.Errorf("Create() = %v, want %v", got, tt.want)
@@ -96,7 +84,7 @@ func Test_controllerEventHandler_Create(t *testing.T) {
9684
}
9785
}
9886

99-
func Test_controllerEventHandler_Update(t *testing.T) {
87+
func Test_AccountingEventHandler_Update(t *testing.T) {
10088
type fields struct {
10189
client client.Client
10290
}
@@ -160,7 +148,7 @@ func Test_controllerEventHandler_Update(t *testing.T) {
160148
}
161149
for _, tt := range tests {
162150
t.Run(tt.name, func(t *testing.T) {
163-
e := newHandler(tt.fields.client)
151+
e := NewAccountingEventHandler(tt.fields.client)
164152
e.Update(tt.args.ctx, tt.args.evt, tt.args.q)
165153
if got := tt.args.q.Len(); got > tt.want {
166154
t.Errorf("Create() = %v, want %v", got, tt.want)
@@ -169,7 +157,7 @@ func Test_controllerEventHandler_Update(t *testing.T) {
169157
}
170158
}
171159

172-
func Test_controllerEventHandler_Delete(t *testing.T) {
160+
func Test_AccountingEventHandler_Delete(t *testing.T) {
173161
type fields struct {
174162
client client.Client
175163
}
@@ -228,7 +216,7 @@ func Test_controllerEventHandler_Delete(t *testing.T) {
228216
}
229217
for _, tt := range tests {
230218
t.Run(tt.name, func(t *testing.T) {
231-
e := newHandler(tt.fields.client)
219+
e := NewAccountingEventHandler(tt.fields.client)
232220
e.Delete(tt.args.ctx, tt.args.evt, tt.args.q)
233221
if got := tt.args.q.Len(); got > tt.want {
234222
t.Errorf("Create() = %v, want %v", got, tt.want)
@@ -237,7 +225,7 @@ func Test_controllerEventHandler_Delete(t *testing.T) {
237225
}
238226
}
239227

240-
func Test_controllerEventHandler_Generic(t *testing.T) {
228+
func Test_AccountingEventHandler_Generic(t *testing.T) {
241229
type fields struct {
242230
client client.Client
243231
}
@@ -267,7 +255,7 @@ func Test_controllerEventHandler_Generic(t *testing.T) {
267255
}
268256
for _, tt := range tests {
269257
t.Run(tt.name, func(t *testing.T) {
270-
e := newHandler(tt.fields.client)
258+
e := NewAccountingEventHandler(tt.fields.client)
271259
e.Generic(tt.args.ctx, tt.args.evt, tt.args.q)
272260
if got := tt.args.q.Len(); got > tt.want {
273261
t.Errorf("Create() = %v, want %v", got, tt.want)

0 commit comments

Comments
 (0)