Skip to content

Commit 6252d73

Browse files
feat(metrics): add type description and schedule name to labels (#132)
1 parent 7c9edb5 commit 6252d73

File tree

1 file changed

+62
-16
lines changed

1 file changed

+62
-16
lines changed

internal/metrics/metrics.go

Lines changed: 62 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,7 @@ func (s *MetricsRegistryImpl) IncOperationsStartedCounter(operation types.Operat
109109
operation.GetContainerID(),
110110
operation.GetDatabaseName(),
111111
operation.GetType().String(),
112+
operation.GetTypeDescription(),
112113
label,
113114
).Inc()
114115
}
@@ -131,6 +132,7 @@ func (s *MetricsRegistryImpl) ReportOperationInflight(operation types.Operation)
131132
operation.GetContainerID(),
132133
operation.GetDatabaseName(),
133134
operation.GetType().String(),
135+
operation.GetTypeDescription(),
134136
operation.GetState().String(),
135137
label,
136138
).Inc()
@@ -141,6 +143,7 @@ func (s *MetricsRegistryImpl) ReportOperationInflight(operation types.Operation)
141143
operation.GetContainerID(),
142144
operation.GetDatabaseName(),
143145
operation.GetType().String(),
146+
operation.GetTypeDescription(),
144147
operation.GetState().String(),
145148
).Observe(duration.Seconds())
146149
}
@@ -154,6 +157,7 @@ func (s *MetricsRegistryImpl) ReportOperationMetrics(operation types.Operation)
154157
operation.GetContainerID(),
155158
operation.GetDatabaseName(),
156159
operation.GetType().String(),
160+
operation.GetTypeDescription(),
157161
operation.GetState().String(),
158162
).Observe(duration.Seconds())
159163
}
@@ -167,7 +171,12 @@ func (s *MetricsRegistryImpl) ReportOperationMetrics(operation types.Operation)
167171
}
168172

169173
s.operationsFinished.WithLabelValues(
170-
operation.GetContainerID(), operation.GetDatabaseName(), operation.GetType().String(), operation.GetState().String(), label,
174+
operation.GetContainerID(),
175+
operation.GetDatabaseName(),
176+
operation.GetType().String(),
177+
operation.GetTypeDescription(),
178+
operation.GetState().String(),
179+
label,
171180
).Inc()
172181

173182
}
@@ -203,25 +212,62 @@ func (s *MetricsRegistryImpl) IncCompletedBackupsCount(containerId string, datab
203212
}
204213

205214
func (s *MetricsRegistryImpl) IncScheduleCounters(schedule *types.BackupSchedule, err error) {
215+
var scheduleNameLabel string
216+
if schedule.Name != nil {
217+
scheduleNameLabel = *schedule.Name
218+
} else {
219+
scheduleNameLabel = ""
220+
}
221+
206222
if err != nil {
207-
s.scheduleActionFailedCount.WithLabelValues(schedule.ContainerID, schedule.DatabaseName, schedule.ID).Inc()
223+
s.scheduleActionFailedCount.WithLabelValues(
224+
schedule.ContainerID,
225+
schedule.DatabaseName,
226+
schedule.ID,
227+
scheduleNameLabel,
228+
).Inc()
208229
} else {
209-
s.scheduleActionSucceededCount.WithLabelValues(schedule.ContainerID, schedule.DatabaseName, schedule.ID).Inc()
230+
s.scheduleActionSucceededCount.WithLabelValues(
231+
schedule.ContainerID,
232+
schedule.DatabaseName,
233+
schedule.ID,
234+
scheduleNameLabel,
235+
).Inc()
210236
}
211237
if schedule.RecoveryPoint != nil {
212-
s.scheduleLastBackupTimestamp.WithLabelValues(schedule.ContainerID, schedule.DatabaseName, schedule.ID).Set(float64(schedule.RecoveryPoint.Unix()))
238+
s.scheduleLastBackupTimestamp.WithLabelValues(
239+
schedule.ContainerID,
240+
schedule.DatabaseName,
241+
schedule.ID,
242+
scheduleNameLabel,
243+
).Set(float64(schedule.RecoveryPoint.Unix()))
213244
} else if schedule.Audit != nil && schedule.Audit.CreatedAt != nil {
214245
// Report schedule creation time as last backup time if no backups were made
215-
s.scheduleLastBackupTimestamp.WithLabelValues(schedule.ContainerID, schedule.DatabaseName, schedule.ID).Set(float64(schedule.Audit.CreatedAt.AsTime().Unix()))
246+
s.scheduleLastBackupTimestamp.WithLabelValues(
247+
schedule.ContainerID,
248+
schedule.DatabaseName,
249+
schedule.ID,
250+
scheduleNameLabel,
251+
).Set(float64(schedule.Audit.CreatedAt.AsTime().Unix()))
216252
}
217253
info := schedule.GetBackupInfo(s.clock)
218254
if info != nil {
219-
s.scheduleRPOMarginRatio.WithLabelValues(schedule.ContainerID, schedule.DatabaseName, schedule.ID).Set(info.LastBackupRpoMarginRatio)
255+
s.scheduleRPOMarginRatio.WithLabelValues(
256+
schedule.ContainerID,
257+
schedule.DatabaseName,
258+
schedule.ID,
259+
scheduleNameLabel,
260+
).Set(info.LastBackupRpoMarginRatio)
220261
} else if schedule.Audit != nil && schedule.Audit.CreatedAt != nil && schedule.ScheduleSettings.RecoveryPointObjective != nil {
221262
// Report fake LastBackupRpoMarginRatio based on schedule creation time if no backups were made
222263
fakeRpoMargin := s.clock.Since(schedule.Audit.CreatedAt.AsTime())
223264
fakeLastBackupRpoMarginRatio := fakeRpoMargin.Seconds() / float64(schedule.ScheduleSettings.RecoveryPointObjective.Seconds)
224-
s.scheduleRPOMarginRatio.WithLabelValues(schedule.ContainerID, schedule.DatabaseName, schedule.ID).Set(fakeLastBackupRpoMarginRatio)
265+
s.scheduleRPOMarginRatio.WithLabelValues(
266+
schedule.ContainerID,
267+
schedule.DatabaseName,
268+
schedule.ID,
269+
scheduleNameLabel,
270+
).Set(fakeLastBackupRpoMarginRatio)
225271
}
226272
}
227273

@@ -265,32 +311,32 @@ func newMetricsRegistry(ctx context.Context, wg *sync.WaitGroup, cfg *config.Met
265311
Name: "duration_seconds",
266312
Help: "Duration of completed operations in seconds",
267313
Buckets: prometheus.ExponentialBuckets(10, 2, 8),
268-
}, []string{"container_id", "database", "type", "status"})
314+
}, []string{"container_id", "database", "type", "type_description", "status"})
269315

270316
s.inflightOperationsDuration = promauto.With(s.reg).NewHistogramVec(prometheus.HistogramOpts{
271317
Subsystem: "operations",
272318
Name: "inflight_duration_seconds",
273319
Help: "Duration of running operations in seconds",
274320
Buckets: prometheus.ExponentialBuckets(10, 2, 8),
275-
}, []string{"container_id", "database", "type", "state"})
321+
}, []string{"container_id", "database", "type", "type_description", "state"})
276322

277323
s.operationsStarted = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{
278324
Subsystem: "operations",
279325
Name: "started_counter",
280326
Help: "Total count of started operations",
281-
}, []string{"container_id", "database", "type", "schedule_id"})
327+
}, []string{"container_id", "database", "type", "type_description", "schedule_id"})
282328

283329
s.operationsFinished = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{
284330
Subsystem: "operations",
285331
Name: "finished_counter",
286332
Help: "Total count of finished operations",
287-
}, []string{"container_id", "database", "type", "status", "schedule_id"})
333+
}, []string{"container_id", "database", "type", "type_description", "status", "schedule_id"})
288334

289335
s.operationsInflight = promauto.With(s.reg).NewGaugeVec(prometheus.GaugeOpts{
290336
Subsystem: "operations",
291337
Name: "inflight",
292338
Help: "Total count of active operations",
293-
}, []string{"container_id", "database", "type", "status", "schedule_id"})
339+
}, []string{"container_id", "database", "type", "type_description", "status", "schedule_id"})
294340

295341
s.handlerRunsCount = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{
296342
Subsystem: "operation_processor",
@@ -326,25 +372,25 @@ func newMetricsRegistry(ctx context.Context, wg *sync.WaitGroup, cfg *config.Met
326372
Subsystem: "schedules",
327373
Name: "failed_count",
328374
Help: "Total count of failed scheduled backup runs",
329-
}, []string{"container_id", "database", "schedule_id"})
375+
}, []string{"container_id", "database", "schedule_id", "schedule_name"})
330376

331377
s.scheduleActionSucceededCount = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{
332378
Subsystem: "schedules",
333379
Name: "succeeded_count",
334380
Help: "Total count of successful scheduled backup runs",
335-
}, []string{"container_id", "database", "schedule_id"})
381+
}, []string{"container_id", "database", "schedule_id", "schedule_name"})
336382

337383
s.scheduleLastBackupTimestamp = promauto.With(s.reg).NewGaugeVec(prometheus.GaugeOpts{
338384
Subsystem: "schedules",
339385
Name: "last_backup_timestamp",
340386
Help: "Timestamp of last successful backup for this schedule",
341-
}, []string{"container_id", "database", "schedule_id"})
387+
}, []string{"container_id", "database", "schedule_id", "schedule_name"})
342388

343389
s.scheduleRPOMarginRatio = promauto.With(s.reg).NewGaugeVec(prometheus.GaugeOpts{
344390
Subsystem: "schedules",
345391
Name: "rpo_margin_ratio",
346392
Help: "if RPO is set for schedule, calculates a ratio to which RPO is satisfied",
347-
}, []string{"container_id", "database", "schedule_id"})
393+
}, []string{"container_id", "database", "schedule_id", "schedule_name"})
348394

349395
mux := http.NewServeMux()
350396
mux.Handle("/metrics", promhttp.HandlerFor(s.reg, promhttp.HandlerOpts{Registry: s.reg}))

0 commit comments

Comments
 (0)