Skip to content

Commit 1edb63f

Browse files
authored
fix: make ipamv2 metrics resilient to missing custom resource definitions (#3029)
Signed-off-by: Evan Baker <[email protected]>
1 parent ffcac52 commit 1edb63f

File tree

4 files changed

+117
-62
lines changed

4 files changed

+117
-62
lines changed

cns/ipampool/metrics/observer.go

Lines changed: 96 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
1111
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
1212
"github.com/pkg/errors"
13+
"golang.org/x/sync/errgroup"
1314
)
1415

1516
// Subnet ARM ID /subscriptions/$(SUB)/resourceGroups/$(GROUP)/providers/Microsoft.Network/virtualNetworks/$(VNET)/subnets/$(SUBNET)
@@ -46,14 +47,22 @@ type metaState struct {
4647
subnetCIDR string
4748
}
4849

50+
type observer struct {
51+
ipSrc func() map[string]cns.IPConfigurationStatus
52+
nncSrc func(context.Context) (*v1alpha.NodeNetworkConfig, error)
53+
cssSrc func(context.Context) ([]v1alpha1.ClusterSubnetState, error)
54+
}
55+
4956
// NewLegacyMetricsObserver creates a closed functional scope which can be invoked to
5057
// observe the legacy IPAM pool metrics.
5158
//
5259
//nolint:lll // ignore line length
53-
func NewLegacyMetricsObserver(ctx context.Context, ipcli func() map[string]cns.IPConfigurationStatus, nnccli func(context.Context) (*v1alpha.NodeNetworkConfig, error), csscli func(context.Context) ([]v1alpha1.ClusterSubnetState, error)) func() error {
54-
return func() error {
55-
return observeMetrics(ctx, ipcli, nnccli, csscli)
56-
}
60+
func NewLegacyMetricsObserver(ipSrc func() map[string]cns.IPConfigurationStatus, nncSrc func(context.Context) (*v1alpha.NodeNetworkConfig, error), cssSrc func(context.Context) ([]v1alpha1.ClusterSubnetState, error)) func(context.Context) error {
61+
return (&observer{
62+
ipSrc: ipSrc,
63+
nncSrc: nncSrc,
64+
cssSrc: cssSrc,
65+
}).observeMetrics
5766
}
5867

5968
// generateARMID uses the Subnet ARM ID format to populate the ARM ID with the metadata.
@@ -73,68 +82,98 @@ func generateARMID(nc *v1alpha.NetworkContainer) string {
7382
// observeMetrics observes the IP pool and updates the metrics. Blocking.
7483
//
7584
//nolint:lll // ignore line length
76-
func observeMetrics(ctx context.Context, ipcli func() map[string]cns.IPConfigurationStatus, nnccli func(context.Context) (*v1alpha.NodeNetworkConfig, error), csscli func(context.Context) ([]v1alpha1.ClusterSubnetState, error)) error {
77-
csslist, err := csscli(ctx)
78-
if err != nil {
79-
return err
80-
}
81-
nnc, err := nnccli(ctx)
82-
if err != nil {
83-
return err
84-
}
85-
ips := ipcli()
85+
func (o *observer) observeMetrics(ctx context.Context) error {
86+
// The error group is used to allow individual metrics sources to fail without
87+
// failing out the entire attempt to observe the Pool. This may happen if there is a
88+
// transient issue with the source of the data, or if the source is not available
89+
// (like if the CRD is not installed).
90+
var g errgroup.Group
8691

92+
// Get the current state of the world.
8793
var meta metaState
88-
for i := range csslist {
89-
if csslist[i].Status.Exhausted {
90-
meta.exhausted = true
91-
break
92-
}
93-
}
94-
if len(nnc.Status.NetworkContainers) > 0 {
95-
// Set SubnetName, SubnetAddressSpace and Pod Network ARM ID values to the global subnet, subnetCIDR and subnetARM variables.
96-
meta.subnet = nnc.Status.NetworkContainers[0].SubnetName
97-
meta.subnetCIDR = nnc.Status.NetworkContainers[0].SubnetAddressSpace
98-
meta.subnetARMID = generateARMID(&nnc.Status.NetworkContainers[0])
99-
}
100-
meta.primaryIPAddresses = make(map[string]struct{})
101-
// Add Primary IP to Map, if not present.
102-
// This is only for Swift i.e. if NC Type is vnet.
103-
for i := 0; i < len(nnc.Status.NetworkContainers); i++ {
104-
nc := nnc.Status.NetworkContainers[i]
105-
if nc.Type == "" || nc.Type == v1alpha.VNET {
106-
meta.primaryIPAddresses[nc.PrimaryIP] = struct{}{}
94+
g.Go(func() error {
95+
// Try to fetch the ClusterSubnetState, if available.
96+
if o.cssSrc != nil {
97+
csslist, err := o.cssSrc(ctx)
98+
if err != nil {
99+
return err
100+
}
101+
for i := range csslist {
102+
if csslist[i].Status.Exhausted {
103+
meta.exhausted = true
104+
break
105+
}
106+
}
107107
}
108+
return nil
109+
})
108110

109-
if nc.Type == v1alpha.VNETBlock {
110-
primaryPrefix, err := netip.ParsePrefix(nc.PrimaryIP)
111+
var state ipPoolState
112+
g.Go(func() error {
113+
// Try to fetch the NodeNetworkConfig, if available.
114+
if o.nncSrc != nil {
115+
nnc, err := o.nncSrc(ctx)
111116
if err != nil {
112-
return errors.Wrapf(err, "unable to parse ip prefix: %s", nc.PrimaryIP)
117+
return err
118+
}
119+
if len(nnc.Status.NetworkContainers) > 0 {
120+
// Set the subnet name, subnet address space and pod network ARM ID on meta from the first NetworkContainer.
121+
meta.subnet = nnc.Status.NetworkContainers[0].SubnetName
122+
meta.subnetCIDR = nnc.Status.NetworkContainers[0].SubnetAddressSpace
123+
meta.subnetARMID = generateARMID(&nnc.Status.NetworkContainers[0])
124+
}
125+
meta.primaryIPAddresses = make(map[string]struct{})
126+
// Add Primary IP to Map, if not present.
127+
// This is only for Swift i.e. if NC Type is vnet.
128+
for i := 0; i < len(nnc.Status.NetworkContainers); i++ {
129+
nc := nnc.Status.NetworkContainers[i]
130+
if nc.Type == "" || nc.Type == v1alpha.VNET {
131+
meta.primaryIPAddresses[nc.PrimaryIP] = struct{}{}
132+
}
133+
134+
if nc.Type == v1alpha.VNETBlock {
135+
primaryPrefix, err := netip.ParsePrefix(nc.PrimaryIP)
136+
if err != nil {
137+
return errors.Wrapf(err, "unable to parse ip prefix: %s", nc.PrimaryIP)
138+
}
139+
meta.primaryIPAddresses[primaryPrefix.Addr().String()] = struct{}{}
140+
}
113141
}
114-
meta.primaryIPAddresses[primaryPrefix.Addr().String()] = struct{}{}
142+
state.requestedIPs = nnc.Spec.RequestedIPCount
143+
meta.batch = nnc.Status.Scaler.BatchSize
144+
meta.max = nnc.Status.Scaler.MaxIPCount
115145
}
116-
}
146+
return nil
147+
})
117148

118-
state := ipPoolState{
119-
secondaryIPs: int64(len(ips)),
120-
requestedIPs: nnc.Spec.RequestedIPCount,
121-
}
122-
for i := range ips {
123-
ip := ips[i]
124-
switch ip.GetState() {
125-
case types.Assigned:
126-
state.allocatedToPods++
127-
case types.Available:
128-
state.available++
129-
case types.PendingProgramming:
130-
state.pendingProgramming++
131-
case types.PendingRelease:
132-
state.pendingRelease++
149+
g.Go(func() error {
150+
// Try to fetch the IPConfigurations, if available.
151+
if o.ipSrc != nil {
152+
ips := o.ipSrc()
153+
state.secondaryIPs = int64(len(ips))
154+
for i := range ips {
155+
ip := ips[i]
156+
switch ip.GetState() {
157+
case types.Assigned:
158+
state.allocatedToPods++
159+
case types.Available:
160+
state.available++
161+
case types.PendingProgramming:
162+
state.pendingProgramming++
163+
case types.PendingRelease:
164+
state.pendingRelease++
165+
}
166+
}
133167
}
134-
}
168+
return nil
169+
})
170+
171+
err := g.Wait()
172+
135173
state.currentAvailableIPs = state.secondaryIPs - state.allocatedToPods - state.pendingRelease
136174
state.expectedAvailableIPs = state.requestedIPs - state.allocatedToPods
137175

176+
// Update the metrics.
138177
labels := []string{meta.subnet, meta.subnetCIDR, meta.subnetARMID}
139178
IpamAllocatedIPCount.WithLabelValues(labels...).Set(float64(state.allocatedToPods))
140179
IpamAvailableIPCount.WithLabelValues(labels...).Set(float64(state.available))
@@ -153,5 +192,8 @@ func observeMetrics(ctx context.Context, ipcli func() map[string]cns.IPConfigura
153192
} else {
154193
IpamSubnetExhaustionState.WithLabelValues(labels...).Set(float64(SubnetIPNotExhausted))
155194
}
195+
if err != nil {
196+
return errors.Wrap(err, "failed to collect all metrics")
197+
}
156198
return nil
157199
}

cns/ipampool/v2/monitor.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ type Monitor struct {
4747
nncSource <-chan v1alpha.NodeNetworkConfig
4848
started chan interface{}
4949
once sync.Once
50-
legacyMetricsObserver func() error
50+
legacyMetricsObserver func(context.Context) error
5151
}
5252

5353
func NewMonitor(z *zap.Logger, store ipStateStore, nnccli nodeNetworkConfigSpecUpdater, demandSource <-chan int, nncSource <-chan v1alpha.NodeNetworkConfig, cssSource <-chan v1alpha1.ClusterSubnetState) *Monitor { //nolint:lll // it's fine
@@ -59,7 +59,7 @@ func NewMonitor(z *zap.Logger, store ipStateStore, nnccli nodeNetworkConfigSpecU
5959
cssSource: cssSource,
6060
nncSource: nncSource,
6161
started: make(chan interface{}),
62-
legacyMetricsObserver: func() error { return nil },
62+
legacyMetricsObserver: func(context.Context) error { return nil },
6363
}
6464
}
6565

@@ -100,7 +100,7 @@ func (pm *Monitor) Start(ctx context.Context) error {
100100
if err := pm.reconcile(ctx); err != nil {
101101
pm.z.Error("reconcile failed", zap.Error(err))
102102
}
103-
if err := pm.legacyMetricsObserver(); err != nil {
103+
if err := pm.legacyMetricsObserver(ctx); err != nil {
104104
pm.z.Error("legacy metrics observer failed", zap.Error(err))
105105
}
106106
}
@@ -151,7 +151,7 @@ func (pm *Monitor) buildNNCSpec(request int64) v1alpha.NodeNetworkConfigSpec {
151151
return spec
152152
}
153153

154-
func (pm *Monitor) WithLegacyMetricsObserver(observer func() error) {
154+
func (pm *Monitor) WithLegacyMetricsObserver(observer func(context.Context) error) {
155155
pm.legacyMetricsObserver = observer
156156
}
157157

cns/service/main.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ func startTelemetryService(ctx context.Context) {
483483
log.Errorf("Telemetry service failed to start: %w", err)
484484
return
485485
}
486-
tb.PushData(rootCtx)
486+
tb.PushData(ctx)
487487
}
488488

489489
// Main is the entry point for CNS.
@@ -1349,6 +1349,14 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
13491349
}
13501350
}
13511351

1352+
if cnsconfig.EnableSubnetScarcity {
1353+
cacheOpts.ByObject[&cssv1alpha1.ClusterSubnetState{}] = cache.ByObject{
1354+
Namespaces: map[string]cache.Config{
1355+
"kube-system": {},
1356+
},
1357+
}
1358+
}
1359+
13521360
managerOpts := ctrlmgr.Options{
13531361
Scheme: scheme,
13541362
Metrics: ctrlmetrics.Options{BindAddress: "0"},
@@ -1374,9 +1382,13 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
13741382
cssCh := make(chan cssv1alpha1.ClusterSubnetState)
13751383
ipDemandCh := make(chan int)
13761384
if cnsconfig.EnableIPAMv2 {
1385+
cssSrc := func(context.Context) ([]cssv1alpha1.ClusterSubnetState, error) { return nil, nil }
1386+
if cnsconfig.EnableSubnetScarcity {
1387+
cssSrc = clustersubnetstate.NewClient(manager.GetClient()).List
1388+
}
13771389
nncCh := make(chan v1alpha.NodeNetworkConfig)
13781390
pmv2 := ipampoolv2.NewMonitor(z, httpRestServiceImplementation, cachedscopedcli, ipDemandCh, nncCh, cssCh)
1379-
obs := metrics.NewLegacyMetricsObserver(ctx, httpRestService.GetPodIPConfigState, cachedscopedcli.Get, clustersubnetstate.NewClient(manager.GetClient()).List)
1391+
obs := metrics.NewLegacyMetricsObserver(httpRestService.GetPodIPConfigState, cachedscopedcli.Get, cssSrc)
13801392
pmv2.WithLegacyMetricsObserver(obs)
13811393
poolMonitor = pmv2.AsV1(nncCh)
13821394
} else {
@@ -1462,13 +1474,14 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
14621474
// wait for the Reconciler to run once on a NNC that was made for this Node.
14631475
// the nncReadyCtx has a timeout of 15 minutes, after which we will consider
14641476
// this false and the NNC Reconciler stuck/failed, log and retry.
1465-
nncReadyCtx, _ := context.WithTimeout(ctx, 15*time.Minute) //nolint // it will time out and not leak
1477+
nncReadyCtx, cancel := context.WithTimeout(ctx, 15*time.Minute) //nolint // it will time out and not leak
14661478
if started, err := nncReconciler.Started(nncReadyCtx); !started {
14671479
log.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err)
14681480
nncReconcilerStartFailures.Inc()
14691481
continue
14701482
}
14711483
logger.Printf("NodeNetworkConfig reconciler has started.")
1484+
cancel()
14721485
break
14731486
}
14741487

crd/clustersubnetstate/client.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,6 @@ func (c *Client) Get(ctx context.Context, key types.NamespacedName) (*v1alpha1.C
106106

107107
func (c *Client) List(ctx context.Context) ([]v1alpha1.ClusterSubnetState, error) {
108108
clusterSubnetStateList := &v1alpha1.ClusterSubnetStateList{}
109-
err := c.cli.List(ctx, clusterSubnetStateList)
109+
err := c.cli.List(ctx, clusterSubnetStateList, client.InNamespace("kube-system"))
110110
return clusterSubnetStateList.Items, errors.Wrap(err, "failed to list css")
111111
}

0 commit comments

Comments
 (0)