Skip to content

Commit 48d6a2f

Browse files
Use resource locker to perform a single operation on a device at a time
This patch makes use of the `ResourceLocker` instance to only perform a single operation on a network device, minimizing the rist for unexpected side-effects of multiple controllers performing configuration changes on a network device at the same time.
1 parent 5ad9797 commit 48d6a2f

30 files changed

+626
-47
lines changed

cmd/main.go

Lines changed: 67 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,17 @@ import (
1515
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
1616
// to ensure that exec-entrypoint and run can make use of them.
1717
_ "k8s.io/client-go/plugin/pkg/client/auth"
18-
"k8s.io/utils/ptr"
1918

2019
// Set runtime concurrency to match CPU limit imposed by Kubernetes
2120
_ "go.uber.org/automaxprocs"
2221

2322
"github.com/sapcc/go-api-declarations/bininfo"
2423
"go.uber.org/zap/zapcore"
24+
coordinationv1 "k8s.io/api/coordination/v1"
2525
"k8s.io/apimachinery/pkg/runtime"
2626
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2727
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
28+
"k8s.io/utils/ptr"
2829
ctrl "sigs.k8s.io/controller-runtime"
2930
"sigs.k8s.io/controller-runtime/pkg/cache"
3031
"sigs.k8s.io/controller-runtime/pkg/certwatcher"
@@ -44,6 +45,7 @@ import (
4445
nxcontroller "github.com/ironcore-dev/network-operator/internal/controller/cisco/nx"
4546
corecontroller "github.com/ironcore-dev/network-operator/internal/controller/core"
4647
"github.com/ironcore-dev/network-operator/internal/provider"
48+
"github.com/ironcore-dev/network-operator/internal/resourcelock"
4749
webhooknxv1alpha1 "github.com/ironcore-dev/network-operator/internal/webhook/cisco/nx/v1alpha1"
4850
webhookv1alpha1 "github.com/ironcore-dev/network-operator/internal/webhook/core/v1alpha1"
4951
// +kubebuilder:scaffold:imports
@@ -77,6 +79,9 @@ func main() {
7779
var providerName string
7880
var requeueInterval time.Duration
7981
var maxConcurrentReconciles int
82+
var lockerNamespace string
83+
var lockerDuration time.Duration
84+
var lockerRenewInterval time.Duration
8085
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
8186
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
8287
flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
@@ -92,6 +97,9 @@ func main() {
9297
flag.StringVar(&providerName, "provider", "openconfig", "The provider to use for the controller. If not specified, the default provider is used. Available providers: "+strings.Join(provider.Providers(), ", "))
9398
flag.DurationVar(&requeueInterval, "requeue-interval", 30*time.Second, "The interval after which Kubernetes resources should be reconciled again regardless of whether they have changed.")
9499
flag.IntVar(&maxConcurrentReconciles, "max-concurrent-reconciles", 1, "The maximum number of concurrent reconciles per controller. Defaults to 1.")
100+
flag.StringVar(&lockerNamespace, "locker-namespace", "", "The namespace to use for resource locker coordination. If not specified, uses the namespace the manager is deployed in, or 'default' if undetectable.")
101+
flag.DurationVar(&lockerDuration, "locker-duration", 10*time.Second, "The duration of the resource locker lease.")
102+
flag.DurationVar(&lockerRenewInterval, "locker-renew-interval", 5*time.Second, "The interval at which the resource locker lease is renewed.")
95103
opts := zap.Options{
96104
Development: true,
97105
TimeEncoder: zapcore.ISO8601TimeEncoder,
@@ -202,12 +210,7 @@ func main() {
202210
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
203211
// speeds up voluntary leader transitions as the new leader don't have to wait
204212
// LeaseDuration time first.
205-
//
206-
// In the default scaffold provided, the program ends immediately after
207-
// the manager stops, so would be fine to enable this option. However,
208-
// if you are doing or is intended to do any operation such as perform cleanups
209-
// after the manager stops then its usage might be unsafe.
210-
// LeaderElectionReleaseOnCancel: true,
213+
LeaderElectionReleaseOnCancel: true,
211214
})
212215
if err != nil {
213216
setupLog.Error(err, "unable to start manager")
@@ -223,6 +226,38 @@ func main() {
223226

224227
ctx := ctrl.SetupSignalHandler()
225228

229+
if lockerNamespace == "" {
230+
if ns, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"); err == nil {
231+
lockerNamespace = strings.TrimSpace(string(ns))
232+
setupLog.Info("Detected namespace from service account", "namespace", lockerNamespace)
233+
} else {
234+
lockerNamespace = "default"
235+
setupLog.Info("Using default namespace for resource locker", "namespace", lockerNamespace)
236+
}
237+
}
238+
239+
locker, err := resourcelock.NewResourceLocker(mgr.GetClient(), lockerNamespace, lockerDuration, lockerRenewInterval)
240+
if err != nil {
241+
setupLog.Error(err, "unable to create resource locker")
242+
os.Exit(1)
243+
}
244+
245+
// Set up cache informer for Lease resources used by ResourceLocker.
246+
// This ensures the cache has an informer for coordination.k8s.io/v1 Lease resources
247+
// before any controller tries to use the ResourceLocker, which is required when
248+
// ReaderFailOnMissingInformer is set to true.
249+
if _, err := mgr.GetCache().GetInformer(ctx, &coordinationv1.Lease{}); err != nil {
250+
setupLog.Error(err, "unable to get informer for Lease resources")
251+
os.Exit(1)
252+
}
253+
setupLog.Info("Lease cache informer initialized", "namespace", lockerNamespace)
254+
255+
// Add the ResourceLocker to the manager so it will be properly cleaned up on shutdown.
256+
if err := mgr.Add(locker); err != nil {
257+
setupLog.Error(err, "unable to add resource locker to manager")
258+
os.Exit(1)
259+
}
260+
226261
if err := (&corecontroller.DeviceReconciler{
227262
Client: mgr.GetClient(),
228263
Scheme: mgr.GetScheme(),
@@ -241,6 +276,7 @@ func main() {
241276
Recorder: mgr.GetEventRecorderFor("interface-controller"),
242277
WatchFilterValue: watchFilterValue,
243278
Provider: prov,
279+
Locker: locker,
244280
RequeueInterval: requeueInterval,
245281
}).SetupWithManager(ctx, mgr); err != nil {
246282
setupLog.Error(err, "unable to create controller", "controller", "Interface")
@@ -253,6 +289,7 @@ func main() {
253289
Recorder: mgr.GetEventRecorderFor("banner-controller"),
254290
WatchFilterValue: watchFilterValue,
255291
Provider: prov,
292+
Locker: locker,
256293
}).SetupWithManager(mgr); err != nil {
257294
setupLog.Error(err, "unable to create controller", "controller", "Banner")
258295
os.Exit(1)
@@ -264,6 +301,7 @@ func main() {
264301
Recorder: mgr.GetEventRecorderFor("user-controller"),
265302
WatchFilterValue: watchFilterValue,
266303
Provider: prov,
304+
Locker: locker,
267305
}).SetupWithManager(mgr); err != nil {
268306
setupLog.Error(err, "unable to create controller", "controller", "User")
269307
os.Exit(1)
@@ -275,6 +313,7 @@ func main() {
275313
Recorder: mgr.GetEventRecorderFor("dns-controller"),
276314
WatchFilterValue: watchFilterValue,
277315
Provider: prov,
316+
Locker: locker,
278317
}).SetupWithManager(mgr); err != nil {
279318
setupLog.Error(err, "unable to create controller", "controller", "DNS")
280319
os.Exit(1)
@@ -286,6 +325,7 @@ func main() {
286325
Recorder: mgr.GetEventRecorderFor("ntp-controller"),
287326
WatchFilterValue: watchFilterValue,
288327
Provider: prov,
328+
Locker: locker,
289329
}).SetupWithManager(mgr); err != nil {
290330
setupLog.Error(err, "unable to create controller", "controller", "NTP")
291331
os.Exit(1)
@@ -297,6 +337,7 @@ func main() {
297337
Recorder: mgr.GetEventRecorderFor("acl-controller"),
298338
WatchFilterValue: watchFilterValue,
299339
Provider: prov,
340+
Locker: locker,
300341
}).SetupWithManager(mgr); err != nil {
301342
setupLog.Error(err, "unable to create controller", "controller", "AccessControlList")
302343
os.Exit(1)
@@ -308,6 +349,7 @@ func main() {
308349
Recorder: mgr.GetEventRecorderFor("certificate-controller"),
309350
WatchFilterValue: watchFilterValue,
310351
Provider: prov,
352+
Locker: locker,
311353
}).SetupWithManager(mgr); err != nil {
312354
setupLog.Error(err, "unable to create controller", "controller", "Certificate")
313355
os.Exit(1)
@@ -319,6 +361,7 @@ func main() {
319361
Recorder: mgr.GetEventRecorderFor("snmp-controller"),
320362
WatchFilterValue: watchFilterValue,
321363
Provider: prov,
364+
Locker: locker,
322365
}).SetupWithManager(mgr); err != nil {
323366
setupLog.Error(err, "unable to create controller", "controller", "SNMP")
324367
os.Exit(1)
@@ -330,6 +373,7 @@ func main() {
330373
Recorder: mgr.GetEventRecorderFor("syslog-controller"),
331374
WatchFilterValue: watchFilterValue,
332375
Provider: prov,
376+
Locker: locker,
333377
}).SetupWithManager(mgr); err != nil {
334378
setupLog.Error(err, "unable to create controller", "controller", "Syslog")
335379
os.Exit(1)
@@ -341,6 +385,7 @@ func main() {
341385
Recorder: mgr.GetEventRecorderFor("managementaccess-controller"),
342386
WatchFilterValue: watchFilterValue,
343387
Provider: prov,
388+
Locker: locker,
344389
}).SetupWithManager(mgr); err != nil {
345390
setupLog.Error(err, "unable to create controller", "controller", "ManagementAccess")
346391
os.Exit(1)
@@ -352,6 +397,7 @@ func main() {
352397
Recorder: mgr.GetEventRecorderFor("isis-controller"),
353398
WatchFilterValue: watchFilterValue,
354399
Provider: prov,
400+
Locker: locker,
355401
RequeueInterval: requeueInterval,
356402
}).SetupWithManager(mgr); err != nil {
357403
setupLog.Error(err, "unable to create controller", "controller", "ISIS")
@@ -364,6 +410,7 @@ func main() {
364410
Recorder: mgr.GetEventRecorderFor("pim-controller"),
365411
WatchFilterValue: watchFilterValue,
366412
Provider: prov,
413+
Locker: locker,
367414
RequeueInterval: requeueInterval,
368415
}).SetupWithManager(mgr); err != nil {
369416
setupLog.Error(err, "unable to create controller", "controller", "PIM")
@@ -376,6 +423,7 @@ func main() {
376423
Recorder: mgr.GetEventRecorderFor("bgp-controller"),
377424
WatchFilterValue: watchFilterValue,
378425
Provider: prov,
426+
Locker: locker,
379427
RequeueInterval: requeueInterval,
380428
}).SetupWithManager(mgr); err != nil {
381429
setupLog.Error(err, "unable to create controller", "controller", "BGP")
@@ -388,6 +436,7 @@ func main() {
388436
Recorder: mgr.GetEventRecorderFor("bgppeer-controller"),
389437
WatchFilterValue: watchFilterValue,
390438
Provider: prov,
439+
Locker: locker,
391440
RequeueInterval: requeueInterval,
392441
}).SetupWithManager(mgr); err != nil {
393442
setupLog.Error(err, "unable to create controller", "controller", "BGPPeer")
@@ -400,6 +449,7 @@ func main() {
400449
Recorder: mgr.GetEventRecorderFor("ospf-controller"),
401450
WatchFilterValue: watchFilterValue,
402451
Provider: prov,
452+
Locker: locker,
403453
RequeueInterval: requeueInterval,
404454
}).SetupWithManager(mgr); err != nil {
405455
setupLog.Error(err, "unable to create controller", "controller", "OSPF")
@@ -412,6 +462,7 @@ func main() {
412462
Recorder: mgr.GetEventRecorderFor("vlan-controller"),
413463
WatchFilterValue: watchFilterValue,
414464
Provider: prov,
465+
Locker: locker,
415466
RequeueInterval: requeueInterval,
416467
}).SetupWithManager(mgr); err != nil {
417468
setupLog.Error(err, "unable to create controller", "controller", "VLAN")
@@ -424,6 +475,7 @@ func main() {
424475
Recorder: mgr.GetEventRecorderFor("vrf-controller"),
425476
WatchFilterValue: watchFilterValue,
426477
Provider: prov,
478+
Locker: locker,
427479
RequeueInterval: requeueInterval,
428480
}).SetupWithManager(mgr); err != nil {
429481
setupLog.Error(err, "unable to create controller", "controller", "VRF")
@@ -433,9 +485,10 @@ func main() {
433485
if err := (&nxcontroller.VPCDomainReconciler{
434486
Client: mgr.GetClient(),
435487
Scheme: mgr.GetScheme(),
436-
Recorder: mgr.GetEventRecorderFor("vpcdomain-controller"),
488+
Recorder: mgr.GetEventRecorderFor("cisco-nx-vpcdomain-controller"),
437489
WatchFilterValue: watchFilterValue,
438490
Provider: prov,
491+
Locker: locker,
439492
RequeueInterval: requeueInterval,
440493
}).SetupWithManager(ctx, mgr); err != nil {
441494
setupLog.Error(err, "unable to create controller", "controller", "VPCDomain")
@@ -448,6 +501,7 @@ func main() {
448501
Recorder: mgr.GetEventRecorderFor("nve-controller"),
449502
WatchFilterValue: watchFilterValue,
450503
Provider: prov,
504+
Locker: locker,
451505
RequeueInterval: requeueInterval,
452506
}).SetupWithManager(ctx, mgr); err != nil {
453507
setupLog.Error(err, "unable to create controller", "controller", "NetworkVirtualizationEdge")
@@ -460,6 +514,7 @@ func main() {
460514
Recorder: mgr.GetEventRecorderFor("cisco-nx-system-controller"),
461515
WatchFilterValue: watchFilterValue,
462516
Provider: prov,
517+
Locker: locker,
463518
}).SetupWithManager(mgr); err != nil {
464519
setupLog.Error(err, "unable to create controller", "controller", "System")
465520
os.Exit(1)
@@ -471,6 +526,7 @@ func main() {
471526
Recorder: mgr.GetEventRecorderFor("evpn-instance-controller"),
472527
WatchFilterValue: watchFilterValue,
473528
Provider: prov,
529+
Locker: locker,
474530
}).SetupWithManager(ctx, mgr); err != nil {
475531
setupLog.Error(err, "unable to create controller", "controller", "EVPNInstance")
476532
os.Exit(1)
@@ -482,6 +538,7 @@ func main() {
482538
Recorder: mgr.GetEventRecorderFor("prefixset-controller"),
483539
WatchFilterValue: watchFilterValue,
484540
Provider: prov,
541+
Locker: locker,
485542
}).SetupWithManager(mgr); err != nil {
486543
setupLog.Error(err, "unable to create controller", "controller", "PrefixSet")
487544
os.Exit(1)
@@ -493,6 +550,7 @@ func main() {
493550
Recorder: mgr.GetEventRecorderFor("routingpolicy-controller"),
494551
WatchFilterValue: watchFilterValue,
495552
Provider: prov,
553+
Locker: locker,
496554
}).SetupWithManager(ctx, mgr); err != nil {
497555
setupLog.Error(err, "unable to create controller", "controller", "RoutingPolicy")
498556
os.Exit(1)
@@ -504,6 +562,7 @@ func main() {
504562
Recorder: mgr.GetEventRecorderFor("cisco-nx-border-gateway-controller"),
505563
WatchFilterValue: watchFilterValue,
506564
Provider: prov,
565+
Locker: locker,
507566
}).SetupWithManager(ctx, mgr); err != nil {
508567
setupLog.Error(err, "unable to create controller", "controller", "BorderGateway")
509568
os.Exit(1)

config/develop/manager_patch.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
- --leader-elect=false
55
- --health-probe-bind-address=:8081
66
- --provider=openconfig
7-
- --requeue-interval=10s
7+
- --requeue-interval=15s
8+
- --max-concurrent-reconciles=5

internal/controller/cisco/nx/bordergateway_controller.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ package nx
55

66
import (
77
"context"
8+
"errors"
89
"fmt"
910
"slices"
11+
"time"
1012

1113
"k8s.io/apimachinery/pkg/api/equality"
1214
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -31,6 +33,7 @@ import (
3133
"github.com/ironcore-dev/network-operator/internal/deviceutil"
3234
"github.com/ironcore-dev/network-operator/internal/provider"
3335
"github.com/ironcore-dev/network-operator/internal/provider/cisco/nxos"
36+
"github.com/ironcore-dev/network-operator/internal/resourcelock"
3437
)
3538

3639
// BorderGatewayReconciler reconciles a BorderGateway object
@@ -47,6 +50,9 @@ type BorderGatewayReconciler struct {
4750

4851
// Provider is the driver that will be used to create & delete the bordergateway.
4952
Provider provider.ProviderFunc
53+
54+
// Locker is used to synchronize operations on resources targeting the same device.
55+
Locker *resourcelock.ResourceLocker
5056
}
5157

5258
// +kubebuilder:rbac:groups=nx.cisco.networking.metal.ironcore.dev,resources=bordergateways,verbs=get;list;watch;create;update;patch;delete
@@ -97,6 +103,21 @@ func (r *BorderGatewayReconciler) Reconcile(ctx context.Context, req ctrl.Reques
97103
return ctrl.Result{}, err
98104
}
99105

106+
if err := r.Locker.AcquireLock(ctx, device.Name, "cisco-nx-border-gateway-controller"); err != nil {
107+
if errors.Is(err, resourcelock.ErrLockAlreadyHeld) {
108+
log.Info("Device is already locked, requeuing reconciliation")
109+
return ctrl.Result{RequeueAfter: time.Second * 5}, nil
110+
}
111+
log.Error(err, "Failed to acquire device lock")
112+
return ctrl.Result{}, err
113+
}
114+
defer func() {
115+
if err := r.Locker.ReleaseLock(ctx, device.Name, "cisco-nx-border-gateway-controller"); err != nil {
116+
log.Error(err, "Failed to release device lock")
117+
reterr = kerrors.NewAggregate([]error{reterr, err})
118+
}
119+
}()
120+
100121
conn, err := deviceutil.GetDeviceConnection(ctx, r, device)
101122
if err != nil {
102123
return ctrl.Result{}, err

0 commit comments

Comments
 (0)