Skip to content

Commit 8880c36

Browse files
authored
Fixes site_link_errors_total not counting errors (#2402)
Fixes #2378
1 parent acf2c90 commit 8880c36

File tree

2 files changed

+42
-26
lines changed

2 files changed

+42
-26
lines changed

cmd/network-observer/internal/collector/metrics/metrics.go

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -109,21 +109,15 @@ type counterMetricByItem struct {
109109
func (c *counterMetricByItem) Ensure(id string, ct int) {
110110
prev, ok := c.Items[id]
111111
if !ok {
112-
prev = ct
113-
c.Items[id] = prev
114-
}
115-
if ct <= prev {
112+
c.Items[id] = ct
116113
return
117114
}
118-
c.Items[id] = ct
119-
c.Counter.Add(float64(ct - prev))
120-
}
121-
122-
func (m *counterMetricByItem) Remove(id string) {
123-
if _, ok := m.Items[id]; !ok {
124-
return
115+
if ct > prev {
116+
c.Counter.Add(float64(ct - prev))
125117
}
126-
delete(m.Items, id)
118+
// When ct < prev the counter was reset (e.g. router restart). Accept
119+
// the new value as the baseline without incrementing.
120+
c.Items[id] = ct
127121
}
128122

129123
type gaugeMetricByID struct {
@@ -411,15 +405,6 @@ func (a *Adaptor) Remove(record vanflow.Record) {
411405
return
412406
}
413407
metric.Remove(record.ID)
414-
linkErrorKey := siteLinkErrors{
415-
SiteID: siteLink.SiteID,
416-
Role: siteLink.Role,
417-
}
418-
counter, ok := a.linkErrors[linkErrorKey]
419-
if !ok {
420-
return
421-
}
422-
counter.Remove(record.ID)
423408
case vanflow.ListenerRecord:
424409
site, wants, ok := a.listenerInfo(record)
425410
if !ok {

cmd/network-observer/internal/collector/metrics/metrics_test.go

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,14 @@ func TestSiteLinkErrorMetrics(t *testing.T) {
160160
reg := prometheus.NewPedanticRegistry()
161161
metrics := New(reg)
162162
interRouter := ptrTo("inter-router")
163+
statusUp := ptrTo("up")
163164
statusDown := ptrTo("down")
164165

165166
metrics.Add(vanflow.RouterRecord{
166167
BaseRecord: vanflow.NewBase("r01"), Parent: ptrTo("s01"), Mode: interRouter,
167168
})
169+
170+
// Initial Add sets the baseline. Counter stays at zero.
168171
metrics.Add(vanflow.LinkRecord{
169172
BaseRecord: vanflow.NewBase("l01"),
170173
Role: interRouter,
@@ -174,6 +177,8 @@ func TestSiteLinkErrorMetrics(t *testing.T) {
174177
})
175178
assert.Equal(t, prom_testutil.CollectAndCount(reg, "skupper_site_link_errors_total"), 1)
176179
assert.Equal(t, prom_testutil.ToFloat64(metrics.siteLinkErrors.WithLabelValues("s01", "inter-router")), 0.0)
180+
181+
// Update increments DownCount 12 -> 15: counter += 3.
177182
metrics.Update(
178183
vanflow.LinkRecord{
179184
BaseRecord: vanflow.NewBase("l01"),
@@ -192,23 +197,51 @@ func TestSiteLinkErrorMetrics(t *testing.T) {
192197
)
193198
assert.Equal(t, prom_testutil.CollectAndCount(reg, "skupper_site_link_errors_total"), 1)
194199
assert.Equal(t, prom_testutil.ToFloat64(metrics.siteLinkErrors.WithLabelValues("s01", "inter-router")), 3.0)
200+
201+
// Source reconnect with higher DownCount: the collector removes all
202+
// records from the source and re-adds them. The re-added record
203+
// carries the already-incremented DownCount. The counter must detect
204+
// the delta across the Remove+Add boundary.
195205
metrics.Remove(vanflow.LinkRecord{
196206
BaseRecord: vanflow.NewBase("l01"),
197207
Role: interRouter,
198208
Status: statusDown,
199209
Parent: ptrTo("r01"),
200210
DownCount: ptrTo(uint64(15)),
201211
})
202-
// cannot decrease
203-
assert.Equal(t, prom_testutil.CollectAndCount(reg, "skupper_site_link_errors_total"), 1)
212+
// Counter must not decrease after removal.
204213
assert.Equal(t, prom_testutil.ToFloat64(metrics.siteLinkErrors.WithLabelValues("s01", "inter-router")), 3.0)
214+
metrics.Add(vanflow.LinkRecord{
215+
BaseRecord: vanflow.NewBase("l01"),
216+
Role: interRouter,
217+
Status: statusUp,
218+
Peer: ptrTo("ap02"),
219+
Parent: ptrTo("r01"),
220+
DownCount: ptrTo(uint64(17)),
221+
})
222+
// Delta of 2 (17 - 15): counter = 5.
223+
assert.Equal(t, prom_testutil.ToFloat64(metrics.siteLinkErrors.WithLabelValues("s01", "inter-router")), 5.0)
224+
225+
// Router restart resets DownCount to zero. Remove+Add with a lower
226+
// DownCount resets the baseline without incrementing the counter.
227+
metrics.Remove(vanflow.LinkRecord{
228+
BaseRecord: vanflow.NewBase("l01"),
229+
Role: interRouter,
230+
Status: statusUp,
231+
Peer: ptrTo("ap02"),
232+
Parent: ptrTo("r01"),
233+
DownCount: ptrTo(uint64(17)),
234+
})
205235
metrics.Add(vanflow.LinkRecord{
206236
BaseRecord: vanflow.NewBase("l01"),
207237
Role: interRouter,
208238
Status: statusDown,
209239
Parent: ptrTo("r01"),
210240
DownCount: ptrTo(uint64(0)),
211241
})
242+
assert.Equal(t, prom_testutil.ToFloat64(metrics.siteLinkErrors.WithLabelValues("s01", "inter-router")), 5.0)
243+
244+
// After reset, subsequent Updates still increment from the new baseline.
212245
metrics.Update(
213246
vanflow.LinkRecord{
214247
BaseRecord: vanflow.NewBase("l01"),
@@ -225,9 +258,7 @@ func TestSiteLinkErrorMetrics(t *testing.T) {
225258
DownCount: ptrTo(uint64(1)),
226259
},
227260
)
228-
// after delete, can still increment
229-
assert.Equal(t, prom_testutil.CollectAndCount(reg, "skupper_site_link_errors_total"), 1)
230-
assert.Equal(t, prom_testutil.ToFloat64(metrics.siteLinkErrors.WithLabelValues("s01", "inter-router")), 4.0)
261+
assert.Equal(t, prom_testutil.ToFloat64(metrics.siteLinkErrors.WithLabelValues("s01", "inter-router")), 6.0)
231262
}
232263

233264
func ptrTo[T any](s T) *T {

0 commit comments

Comments
 (0)