diff --git a/CHANGELOG.md b/CHANGELOG.md index 0851e0540c9..0b03617baa0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ * [FEATURE] Querier: Allow choosing PromQL engine via header. #6777 * [FEATURE] Querier: Support for configuring query optimizers and enabling XFunctions in the Thanos engine. #6873 * [FEATURE] Query Frontend: Add support /api/v1/format_query API for formatting queries. #6893 +* [ENHANCEMENT] Ingester: Add `cortex_ingester_tsdb_wal_replay_unknown_refs_total` and `cortex_ingester_tsdb_wbl_replay_unknown_refs_total` metrics to track unknown series references during WAL/WBL replay. #6945 * [ENHANCEMENT] Ruler: Emit an error message when the rule synchronization fails. #6902 * [ENHANCEMENT] Querier: Support snappy and zstd response compression for `-querier.response-compression` flag. #6848 * [ENHANCEMENT] Tenant Federation: Add a # of query result limit logic when the `-tenant-federation.regex-matcher-enabled` is enabled. #6845 diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index fc05b9764bb..8160216f2a1 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -20,6 +20,13 @@ const ( const ( sampleMetricTypeFloat = "float" sampleMetricTypeHistogram = "histogram" + + typeSeries = "series" + typeSamples = "samples" + typeExemplars = "exemplars" + typeHistograms = "histograms" + typeMetadata = "metadata" + typeTombstones = "tombstones" ) type ingesterMetrics struct { @@ -330,6 +337,8 @@ type tsdbMetrics struct { tsdbWALTruncateTotal *prometheus.Desc tsdbWALTruncateDuration *prometheus.Desc tsdbWALCorruptionsTotal *prometheus.Desc + tsdbWALReplayUnknownRefsTotal *prometheus.Desc + tsdbWBLReplayUnknownRefsTotal *prometheus.Desc tsdbWALWritesFailed *prometheus.Desc tsdbHeadTruncateFail *prometheus.Desc tsdbHeadTruncateTotal *prometheus.Desc @@ -437,6 +446,14 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics { "cortex_ingester_tsdb_wal_corruptions_total", "Total number of TSDB WAL corruptions.", nil, 
nil), + tsdbWALReplayUnknownRefsTotal: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_replay_unknown_refs_total", + "Total number of unknown series references encountered during TSDB WAL replay.", + []string{"type"}, nil), + tsdbWBLReplayUnknownRefsTotal: prometheus.NewDesc( + "cortex_ingester_tsdb_wbl_replay_unknown_refs_total", + "Total number of unknown series references encountered during TSDB WBL replay.", + []string{"type"}, nil), tsdbWALWritesFailed: prometheus.NewDesc( "cortex_ingester_tsdb_wal_writes_failed_total", "Total number of TSDB WAL writes that failed.", @@ -601,6 +618,8 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) { out <- sm.tsdbWALTruncateTotal out <- sm.tsdbWALTruncateDuration out <- sm.tsdbWALCorruptionsTotal + out <- sm.tsdbWALReplayUnknownRefsTotal + out <- sm.tsdbWBLReplayUnknownRefsTotal out <- sm.tsdbWALWritesFailed out <- sm.tsdbHeadTruncateFail out <- sm.tsdbHeadTruncateTotal @@ -659,6 +678,8 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCounters(out, sm.tsdbWALTruncateTotal, "prometheus_tsdb_wal_truncations_total") data.SendSumOfSummaries(out, sm.tsdbWALTruncateDuration, "prometheus_tsdb_wal_truncate_duration_seconds") data.SendSumOfCounters(out, sm.tsdbWALCorruptionsTotal, "prometheus_tsdb_wal_corruptions_total") + data.SendSumOfCountersWithLabels(out, sm.tsdbWALReplayUnknownRefsTotal, "prometheus_tsdb_wal_replay_unknown_refs_total", "type") + data.SendSumOfCountersWithLabels(out, sm.tsdbWBLReplayUnknownRefsTotal, "prometheus_tsdb_wbl_replay_unknown_refs_total", "type") data.SendSumOfCounters(out, sm.tsdbWALWritesFailed, "prometheus_tsdb_wal_writes_failed_total") data.SendSumOfCounters(out, sm.tsdbHeadTruncateFail, "prometheus_tsdb_head_truncations_failed_total") data.SendSumOfCounters(out, sm.tsdbHeadTruncateTotal, "prometheus_tsdb_head_truncations_total") diff --git a/pkg/ingester/metrics_test.go b/pkg/ingester/metrics_test.go index b08b0ca8141..9c7d316b964 100644 --- 
a/pkg/ingester/metrics_test.go +++ b/pkg/ingester/metrics_test.go @@ -240,6 +240,18 @@ func TestTSDBMetrics(t *testing.T) { # TYPE cortex_ingester_tsdb_wal_corruptions_total counter cortex_ingester_tsdb_wal_corruptions_total 2.676537e+06 + # HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WAL replay. + # TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series"} 300 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples"} 303 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="metadata"} 306 + + # HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WBL replay. + # TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars"} 300 + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="histograms"} 303 + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="tombstones"} 306 + # HELP cortex_ingester_tsdb_wal_writes_failed_total Total number of TSDB WAL writes that failed. # TYPE cortex_ingester_tsdb_wal_writes_failed_total counter cortex_ingester_tsdb_wal_writes_failed_total 1486965 @@ -505,6 +517,18 @@ func TestTSDBMetricsWithRemoval(t *testing.T) { # TYPE cortex_ingester_tsdb_wal_corruptions_total counter cortex_ingester_tsdb_wal_corruptions_total 2.676537e+06 + # HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WAL replay. 
+ # TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series"} 300 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples"} 303 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="metadata"} 306 + + # HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during TSDB WBL replay. + # TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars"} 300 + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="histograms"} 303 + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="tombstones"} 306 + # HELP cortex_ingester_tsdb_wal_writes_failed_total Total number of TSDB WAL writes that failed. # TYPE cortex_ingester_tsdb_wal_writes_failed_total counter cortex_ingester_tsdb_wal_writes_failed_total 1486965 @@ -883,6 +907,22 @@ func populateTSDBMetrics(base float64) *prometheus.Registry { }) snapshotReplayErrorTotal.Add(103) + walReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_replay_unknown_refs_total", + Help: "Total number of unknown series references encountered during WAL replay.", + }, []string{"type"}) + walReplayUnknownRefsTotal.WithLabelValues(typeSeries).Add(100) + walReplayUnknownRefsTotal.WithLabelValues(typeSamples).Add(101) + walReplayUnknownRefsTotal.WithLabelValues(typeMetadata).Add(102) + + wblReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wbl_replay_unknown_refs_total", + Help: "Total number of unknown series references encountered during WBL replay.", + }, []string{"type"}) + wblReplayUnknownRefsTotal.WithLabelValues(typeExemplars).Add(100) + wblReplayUnknownRefsTotal.WithLabelValues(typeHistograms).Add(101) + wblReplayUnknownRefsTotal.WithLabelValues(typeTombstones).Add(102) + oooHistogram := 
promauto.With(r).NewHistogram(prometheus.HistogramOpts{ Name: "prometheus_tsdb_sample_ooo_delta", Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",