Skip to content

Commit d9079f5

Browse files
authored
Add tsdb metrics to track unknown series references during wal/wbl replaying (#6945)
Signed-off-by: SungJin1212 <[email protected]>
1 parent a13cd69 commit d9079f5

File tree

3 files changed

+62
-0
lines changed

3 files changed

+62
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
* [FEATURE] Querier: Allow choosing PromQL engine via header. #6777
2323
* [FEATURE] Querier: Support for configuring query optimizers and enabling XFunctions in the Thanos engine. #6873
2424
* [FEATURE] Query Frontend: Add support /api/v1/format_query API for formatting queries. #6893
25+
* [ENHANCEMENT] Ingester: Add `cortex_ingester_tsdb_wal_replay_unknown_refs_total` and `cortex_ingester_tsdb_wbl_replay_unknown_refs_total` metrics to track unknown series references during wal/wbl replaying. #6945
2526
* [ENHANCEMENT] Ruler: Emit an error message when the rule synchronization fails. #6902
2627
* [ENHANCEMENT] Querier: Support snappy and zstd response compression for `-querier.response-compression` flag. #6848
2728
* [ENHANCEMENT] Tenant Federation: Add a # of query result limit logic when the `-tenant-federation.regex-matcher-enabled` is enabled. #6845

pkg/ingester/metrics.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@ const (
2020
const (
2121
sampleMetricTypeFloat = "float"
2222
sampleMetricTypeHistogram = "histogram"
23+
24+
typeSeries = "series"
25+
typeSamples = "samples"
26+
typeExemplars = "exemplars"
27+
typeHistograms = "histograms"
28+
typeMetadata = "metadata"
29+
typeTombstones = "tombstones"
2330
)
2431

2532
type ingesterMetrics struct {
@@ -330,6 +337,8 @@ type tsdbMetrics struct {
330337
tsdbWALTruncateTotal *prometheus.Desc
331338
tsdbWALTruncateDuration *prometheus.Desc
332339
tsdbWALCorruptionsTotal *prometheus.Desc
340+
tsdbWALReplayUnknownRefsTotal *prometheus.Desc
341+
tsdbWBLReplayUnknownRefsTotal *prometheus.Desc
333342
tsdbWALWritesFailed *prometheus.Desc
334343
tsdbHeadTruncateFail *prometheus.Desc
335344
tsdbHeadTruncateTotal *prometheus.Desc
@@ -437,6 +446,14 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
437446
"cortex_ingester_tsdb_wal_corruptions_total",
438447
"Total number of TSDB WAL corruptions.",
439448
nil, nil),
449+
tsdbWALReplayUnknownRefsTotal: prometheus.NewDesc(
450+
"cortex_ingester_tsdb_wal_replay_unknown_refs_total",
451+
"Total number of unknown series references encountered during TSDB WAL replay.",
452+
[]string{"type"}, nil),
453+
tsdbWBLReplayUnknownRefsTotal: prometheus.NewDesc(
454+
"cortex_ingester_tsdb_wbl_replay_unknown_refs_total",
455+
"Total number of unknown series references encountered during TSDB WBL replay.",
456+
[]string{"type"}, nil),
440457
tsdbWALWritesFailed: prometheus.NewDesc(
441458
"cortex_ingester_tsdb_wal_writes_failed_total",
442459
"Total number of TSDB WAL writes that failed.",
@@ -601,6 +618,8 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
601618
out <- sm.tsdbWALTruncateTotal
602619
out <- sm.tsdbWALTruncateDuration
603620
out <- sm.tsdbWALCorruptionsTotal
621+
out <- sm.tsdbWALReplayUnknownRefsTotal
622+
out <- sm.tsdbWBLReplayUnknownRefsTotal
604623
out <- sm.tsdbWALWritesFailed
605624
out <- sm.tsdbHeadTruncateFail
606625
out <- sm.tsdbHeadTruncateTotal
@@ -659,6 +678,8 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
659678
data.SendSumOfCounters(out, sm.tsdbWALTruncateTotal, "prometheus_tsdb_wal_truncations_total")
660679
data.SendSumOfSummaries(out, sm.tsdbWALTruncateDuration, "prometheus_tsdb_wal_truncate_duration_seconds")
661680
data.SendSumOfCounters(out, sm.tsdbWALCorruptionsTotal, "prometheus_tsdb_wal_corruptions_total")
681+
data.SendSumOfCountersWithLabels(out, sm.tsdbWALReplayUnknownRefsTotal, "prometheus_tsdb_wal_replay_unknown_refs_total", "type")
682+
data.SendSumOfCountersWithLabels(out, sm.tsdbWBLReplayUnknownRefsTotal, "prometheus_tsdb_wbl_replay_unknown_refs_total", "type")
662683
data.SendSumOfCounters(out, sm.tsdbWALWritesFailed, "prometheus_tsdb_wal_writes_failed_total")
663684
data.SendSumOfCounters(out, sm.tsdbHeadTruncateFail, "prometheus_tsdb_head_truncations_failed_total")
664685
data.SendSumOfCounters(out, sm.tsdbHeadTruncateTotal, "prometheus_tsdb_head_truncations_total")

pkg/ingester/metrics_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,18 @@ func TestTSDBMetrics(t *testing.T) {
240240
# TYPE cortex_ingester_tsdb_wal_corruptions_total counter
241241
cortex_ingester_tsdb_wal_corruptions_total 2.676537e+06
242242
243+
# HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WAL replay.
244+
# TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter
245+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series"} 300
246+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples"} 303
247+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="metadata"} 306
248+
249+
# HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WBL replay.
250+
# TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter
251+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars"} 300
252+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="histograms"} 303
253+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="tombstones"} 306
254+
243255
# HELP cortex_ingester_tsdb_wal_writes_failed_total Total number of TSDB WAL writes that failed.
244256
# TYPE cortex_ingester_tsdb_wal_writes_failed_total counter
245257
cortex_ingester_tsdb_wal_writes_failed_total 1486965
@@ -505,6 +517,18 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
505517
# TYPE cortex_ingester_tsdb_wal_corruptions_total counter
506518
cortex_ingester_tsdb_wal_corruptions_total 2.676537e+06
507519
520+
# HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WAL replay.
521+
# TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter
522+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series"} 300
523+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples"} 303
524+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="metadata"} 306
525+
526+
# HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WBL replay.
527+
# TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter
528+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars"} 300
529+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="histograms"} 303
530+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="tombstones"} 306
531+
508532
# HELP cortex_ingester_tsdb_wal_writes_failed_total Total number of TSDB WAL writes that failed.
509533
# TYPE cortex_ingester_tsdb_wal_writes_failed_total counter
510534
cortex_ingester_tsdb_wal_writes_failed_total 1486965
@@ -883,6 +907,22 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
883907
})
884908
snapshotReplayErrorTotal.Add(103)
885909

910+
walReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
911+
Name: "prometheus_tsdb_wal_replay_unknown_refs_total",
912+
Help: "Total number of unknown series references encountered during WAL replay.",
913+
}, []string{"type"})
914+
walReplayUnknownRefsTotal.WithLabelValues(typeSeries).Add(100)
915+
walReplayUnknownRefsTotal.WithLabelValues(typeSamples).Add(101)
916+
walReplayUnknownRefsTotal.WithLabelValues(typeMetadata).Add(102)
917+
918+
wblReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
919+
Name: "prometheus_tsdb_wbl_replay_unknown_refs_total",
920+
Help: "Total number of unknown series references encountered during WBL replay.",
921+
}, []string{"type"})
922+
wblReplayUnknownRefsTotal.WithLabelValues(typeExemplars).Add(100)
923+
wblReplayUnknownRefsTotal.WithLabelValues(typeHistograms).Add(101)
924+
wblReplayUnknownRefsTotal.WithLabelValues(typeTombstones).Add(102)
925+
886926
oooHistogram := promauto.With(r).NewHistogram(prometheus.HistogramOpts{
887927
Name: "prometheus_tsdb_sample_ooo_delta",
888928
Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",

0 commit comments

Comments
 (0)