Skip to content

Commit c8dce11

Browse files
committed
replaced-tsdb-append-error-metrics
Signed-off-by: amanycodes <amanycodes@gmail.com>
1 parent 3389cdf commit c8dce11

File tree

6 files changed

+120
-86
lines changed

6 files changed

+120
-86
lines changed

tsdb/block_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,7 @@ func createHeadWithOOOSamples(tb testing.TB, w *wlog.WL, series []storage.Series
738738
require.NoError(tb, app.Commit())
739739

740740
oooSamplesAppended := 0
741-
require.Equal(tb, float64(0), prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended))
741+
require.Equal(tb, float64(0), prom_testutil.ToFloat64(head.metrics.successulSamplesAppended.WithLabelValues(oooAppends, sampleMetricTypeFloat)))
742742

743743
app = head.Appender(context.Background())
744744
for i, lset := range oooSampleLabels {
@@ -751,11 +751,11 @@ func createHeadWithOOOSamples(tb testing.TB, w *wlog.WL, series []storage.Series
751751
}
752752
require.NoError(tb, app.Commit())
753753

754-
actOOOAppended := prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended)
754+
actOOOAppended := prom_testutil.ToFloat64(head.metrics.successulSamplesAppended.WithLabelValues(oooAppends, sampleMetricTypeFloat))
755755
require.GreaterOrEqual(tb, actOOOAppended, float64(oooSamplesAppended-len(series)))
756756
require.LessOrEqual(tb, actOOOAppended, float64(oooSamplesAppended))
757757

758-
require.Equal(tb, float64(totalSamples), prom_testutil.ToFloat64(head.metrics.samplesAppended))
758+
require.Equal(tb, float64(totalSamples), prom_testutil.ToFloat64(head.metrics.successulSamplesAppended.WithLabelValues(successfulAppends, sampleMetricTypeFloat)))
759759

760760
return head
761761
}

tsdb/db_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6897,7 +6897,7 @@ func testOOODisabled(t *testing.T, scenario sampleTypeScenario) {
68976897
requireEqualSeries(t, expSamples, seriesSet, true)
68986898
requireEqualOOOSamples(t, 0, db)
68996899
require.Equal(t, float64(failedSamples),
6900-
prom_testutil.ToFloat64(db.head.metrics.outOfOrderSamples.WithLabelValues(scenario.sampleType))+prom_testutil.ToFloat64(db.head.metrics.outOfBoundSamples.WithLabelValues(scenario.sampleType)),
6900+
prom_testutil.ToFloat64(db.head.metrics.sampleAppendFailures.WithLabelValues(outOfOrder, scenario.sampleType))+prom_testutil.ToFloat64(db.head.metrics.sampleAppendFailures.WithLabelValues(outOfBounds, scenario.sampleType)),
69016901
"number of ooo/oob samples mismatch")
69026902

69036903
// Verifying that no OOO artifacts were generated.

tsdb/head.go

Lines changed: 69 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -342,38 +342,45 @@ func (h *Head) resetInMemoryState() error {
342342
}
343343

344344
type headMetrics struct {
345-
activeAppenders prometheus.Gauge
346-
series prometheus.GaugeFunc
347-
seriesCreated prometheus.Counter
348-
seriesRemoved prometheus.Counter
349-
seriesNotFound prometheus.Counter
350-
chunks prometheus.Gauge
351-
chunksCreated prometheus.Counter
352-
chunksRemoved prometheus.Counter
353-
gcDuration prometheus.Summary
354-
samplesAppended *prometheus.CounterVec
355-
outOfOrderSamplesAppended *prometheus.CounterVec
356-
outOfBoundSamples *prometheus.CounterVec
357-
outOfOrderSamples *prometheus.CounterVec
358-
tooOldSamples *prometheus.CounterVec
359-
walTruncateDuration prometheus.Summary
360-
walCorruptionsTotal prometheus.Counter
361-
dataTotalReplayDuration prometheus.Gauge
362-
headTruncateFail prometheus.Counter
363-
headTruncateTotal prometheus.Counter
364-
checkpointDeleteFail prometheus.Counter
365-
checkpointDeleteTotal prometheus.Counter
366-
checkpointCreationFail prometheus.Counter
367-
checkpointCreationTotal prometheus.Counter
368-
mmapChunkCorruptionTotal prometheus.Counter
369-
snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
370-
oooHistogram prometheus.Histogram
371-
mmapChunksTotal prometheus.Counter
345+
activeAppenders prometheus.Gauge
346+
series prometheus.GaugeFunc
347+
seriesCreated prometheus.Counter
348+
seriesRemoved prometheus.Counter
349+
seriesNotFound prometheus.Counter
350+
chunks prometheus.Gauge
351+
chunksCreated prometheus.Counter
352+
chunksRemoved prometheus.Counter
353+
gcDuration prometheus.Summary
354+
successulSamplesAppended *prometheus.CounterVec
355+
// samplesAppended *prometheus.CounterVec
356+
// outOfOrderSamplesAppended *prometheus.CounterVec
357+
// outOfBoundSamples *prometheus.CounterVec
358+
// outOfOrderSamples *prometheus.CounterVec
359+
// tooOldSamples *prometheus.CounterVec
360+
walTruncateDuration prometheus.Summary
361+
walCorruptionsTotal prometheus.Counter
362+
dataTotalReplayDuration prometheus.Gauge
363+
headTruncateFail prometheus.Counter
364+
headTruncateTotal prometheus.Counter
365+
checkpointDeleteFail prometheus.Counter
366+
checkpointDeleteTotal prometheus.Counter
367+
checkpointCreationFail prometheus.Counter
368+
checkpointCreationTotal prometheus.Counter
369+
mmapChunkCorruptionTotal prometheus.Counter
370+
snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
371+
oooHistogram prometheus.Histogram
372+
mmapChunksTotal prometheus.Counter
373+
sampleAppendFailures *prometheus.CounterVec
372374
}
373375

374376
const (
375377
sampleMetricTypeFloat = "float"
376378
sampleMetricTypeHistogram = "histogram"
379+
outOfBounds = "out_of_bounds"
380+
outOfOrder = "out_of_order"
381+
tooOld = "too_old"
382+
successfulAppends = "successful_appends"
383+
oooAppends = "ooo_appends"
377384
)
378385

379386
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
@@ -428,26 +435,34 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
428435
Name: "prometheus_tsdb_data_replay_duration_seconds",
429436
Help: "Time taken to replay the data on disk.",
430437
}),
431-
samplesAppended: prometheus.NewCounterVec(prometheus.CounterOpts{
432-
Name: "prometheus_tsdb_head_samples_appended_total",
433-
Help: "Total number of appended samples.",
434-
}, []string{"type"}),
435-
outOfOrderSamplesAppended: prometheus.NewCounterVec(prometheus.CounterOpts{
436-
Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
437-
Help: "Total number of appended out of order samples.",
438-
}, []string{"type"}),
439-
outOfBoundSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
440-
Name: "prometheus_tsdb_out_of_bound_samples_total",
441-
Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.",
442-
}, []string{"type"}),
443-
outOfOrderSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
444-
Name: "prometheus_tsdb_out_of_order_samples_total",
445-
Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.",
446-
}, []string{"type"}),
447-
tooOldSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
448-
Name: "prometheus_tsdb_too_old_samples_total",
449-
Help: "Total number of out of order samples ingestion failed attempts with out of support enabled, but sample outside of time window.",
450-
}, []string{"type"}),
438+
successulSamplesAppended: prometheus.NewCounterVec(prometheus.CounterOpts{
439+
Name: "prometheus_tsdb_head_successful_samples_appended_total",
440+
Help: "Total number of successfully appended samples, including out-of-order samples.",
441+
}, []string{"reason", "type"}),
442+
sampleAppendFailures: prometheus.NewCounterVec(prometheus.CounterOpts{
443+
Name: "prometheus_tsdb_head_samples_append_failures_total",
444+
Help: "Total number of sample append failures with different reasons.",
445+
}, []string{"reason", "type"}),
446+
// samplesAppended: prometheus.NewCounterVec(prometheus.CounterOpts{
447+
// Name: "prometheus_tsdb_head_samples_appended_total",
448+
// Help: "Total number of appended samples.",
449+
// }, []string{"type"}),
450+
// outOfOrderSamplesAppended: prometheus.NewCounterVec(prometheus.CounterOpts{
451+
// Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
452+
// Help: "Total number of appended out of order samples.",
453+
// }, []string{"type"}),
454+
// outOfBoundSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
455+
// Name: "prometheus_tsdb_out_of_bound_samples_total",
456+
// Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.",
457+
// }, []string{"type"}),
458+
// outOfOrderSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
459+
// Name: "prometheus_tsdb_out_of_order_samples_total",
460+
// Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.",
461+
// }, []string{"type"}),
462+
// tooOldSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
463+
// Name: "prometheus_tsdb_too_old_samples_total",
464+
// Help: "Total number of out of order samples ingestion failed attempts with out of order support enabled, but sample outside of time window.",
465+
// }, []string{"type"}),
451466
headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
452467
Name: "prometheus_tsdb_head_truncations_failed_total",
453468
Help: "Total number of head truncations that failed.",
@@ -516,11 +531,13 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
516531
m.walTruncateDuration,
517532
m.walCorruptionsTotal,
518533
m.dataTotalReplayDuration,
519-
m.samplesAppended,
520-
m.outOfOrderSamplesAppended,
521-
m.outOfBoundSamples,
522-
m.outOfOrderSamples,
523-
m.tooOldSamples,
534+
// m.samplesAppended,
535+
m.successulSamplesAppended,
536+
// m.outOfOrderSamplesAppended,
537+
m.sampleAppendFailures,
538+
// m.outOfBoundSamples,
539+
// m.outOfOrderSamples,
540+
// m.tooOldSamples,
524541
m.headTruncateFail,
525542
m.headTruncateTotal,
526543
m.checkpointDeleteFail,

tsdb/head_append.go

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,8 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
343343
// Fail fast if OOO is disabled and the sample is out of bounds.
344344
// Otherwise a full check will be done later to decide if the sample is in-order or out-of-order.
345345
if a.oooTimeWindow == 0 && t < a.minValidTime {
346-
a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
346+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfBounds, sampleMetricTypeFloat).Inc()
347+
// a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
347348
return 0, storage.ErrOutOfBounds
348349
}
349350

@@ -377,7 +378,8 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
377378
isOOO, delta, err := s.appendable(t, v, a.headMaxt, a.minValidTime, a.oooTimeWindow)
378379
if err == nil {
379380
if isOOO && a.hints != nil && a.hints.DiscardOutOfOrder {
380-
a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
381+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfOrder, sampleMetricTypeFloat).Inc()
382+
// a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
381383
return 0, storage.ErrOutOfOrderSample
382384
}
383385
s.pendingCommit = true
@@ -388,9 +390,11 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
388390
if err != nil {
389391
switch {
390392
case errors.Is(err, storage.ErrOutOfOrderSample):
391-
a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
393+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfOrder, sampleMetricTypeFloat).Inc()
394+
// a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
392395
case errors.Is(err, storage.ErrTooOldSample):
393-
a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
396+
a.head.metrics.sampleAppendFailures.WithLabelValues(tooOld, sampleMetricTypeFloat).Inc()
397+
// a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
394398
}
395399
return 0, err
396400
}
@@ -655,7 +659,8 @@ func (a *headAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels
655659
// Fail fast if OOO is disabled and the sample is out of bounds.
656660
// Otherwise a full check will be done later to decide if the sample is in-order or out-of-order.
657661
if (a.oooTimeWindow == 0 || !a.head.opts.EnableOOONativeHistograms.Load()) && t < a.minValidTime {
658-
a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
662+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfBounds, sampleMetricTypeHistogram).Inc()
663+
// a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
659664
return 0, storage.ErrOutOfBounds
660665
}
661666

@@ -707,9 +712,11 @@ func (a *headAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels
707712
case errors.Is(err, storage.ErrOutOfOrderSample):
708713
fallthrough
709714
case errors.Is(err, storage.ErrOOONativeHistogramsDisabled):
710-
a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
715+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfOrder, sampleMetricTypeHistogram).Inc()
716+
// a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
711717
case errors.Is(err, storage.ErrTooOldSample):
712-
a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
718+
a.head.metrics.sampleAppendFailures.WithLabelValues(tooOld, sampleMetricTypeHistogram).Inc()
719+
// a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
713720
}
714721
return 0, err
715722
}
@@ -744,9 +751,11 @@ func (a *headAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels
744751
case errors.Is(err, storage.ErrOutOfOrderSample):
745752
fallthrough
746753
case errors.Is(err, storage.ErrOOONativeHistogramsDisabled):
747-
a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
754+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfOrder, sampleMetricTypeHistogram).Inc()
755+
// a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
748756
case errors.Is(err, storage.ErrTooOldSample):
749-
a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
757+
// a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
758+
a.head.metrics.sampleAppendFailures.WithLabelValues(tooOld, sampleMetricTypeHistogram).Inc()
750759
}
751760
return 0, err
752761
}
@@ -1491,14 +1500,22 @@ func (a *headAppender) Commit() (err error) {
14911500
a.commitFloatHistograms(acc)
14921501
a.commitMetadata()
14931502

1494-
a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatOOORejected))
1495-
a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Add(float64(acc.histoOOORejected))
1496-
a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatOOBRejected))
1497-
a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatTooOldRejected))
1498-
a.head.metrics.samplesAppended.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatsAppended))
1499-
a.head.metrics.samplesAppended.WithLabelValues(sampleMetricTypeHistogram).Add(float64(acc.histogramsAppended))
1500-
a.head.metrics.outOfOrderSamplesAppended.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.oooFloatsAccepted))
1501-
a.head.metrics.outOfOrderSamplesAppended.WithLabelValues(sampleMetricTypeHistogram).Add(float64(acc.oooHistogramAccepted))
1503+
// a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatOOORejected))
1504+
// a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Add(float64(acc.histoOOORejected))
1505+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfOrder, sampleMetricTypeHistogram).Add(float64(acc.histoOOORejected))
1506+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfOrder, sampleMetricTypeFloat).Add(float64(acc.floatOOORejected))
1507+
// a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatOOBRejected))
1508+
a.head.metrics.sampleAppendFailures.WithLabelValues(outOfBounds, sampleMetricTypeFloat).Add(float64(acc.floatOOBRejected))
1509+
// a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatTooOldRejected))
1510+
a.head.metrics.sampleAppendFailures.WithLabelValues(tooOld, sampleMetricTypeFloat).Add(float64(acc.floatTooOldRejected))
1511+
a.head.metrics.successulSamplesAppended.WithLabelValues(successfulAppends, sampleMetricTypeFloat).Add(float64(acc.floatsAppended))
1512+
a.head.metrics.successulSamplesAppended.WithLabelValues(successfulAppends, sampleMetricTypeHistogram).Add(float64(acc.histogramsAppended))
1513+
// a.head.metrics.samplesAppended.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.floatsAppended))
1514+
// a.head.metrics.samplesAppended.WithLabelValues(sampleMetricTypeHistogram).Add(float64(acc.histogramsAppended))
1515+
a.head.metrics.successulSamplesAppended.WithLabelValues(oooAppends, sampleMetricTypeFloat).Add(float64(acc.oooFloatsAccepted))
1516+
a.head.metrics.successulSamplesAppended.WithLabelValues(oooAppends, sampleMetricTypeHistogram).Add(float64(acc.oooHistogramAccepted))
1517+
// a.head.metrics.outOfOrderSamplesAppended.WithLabelValues(sampleMetricTypeFloat).Add(float64(acc.oooFloatsAccepted))
1518+
// a.head.metrics.outOfOrderSamplesAppended.WithLabelValues(sampleMetricTypeHistogram).Add(float64(acc.oooHistogramAccepted))
15021519
a.head.updateMinMaxTime(acc.inOrderMint, acc.inOrderMaxt)
15031520
a.head.updateMinOOOMaxOOOTime(acc.oooMinT, acc.oooMaxT)
15041521

0 commit comments

Comments
 (0)