Skip to content

Commit 3ae12a2

Browse files
authored
Centralize metrics used by compactor and add user label to compactor metrics (#6096)
* Centralize metrics used by compactor and add user label to compactor metrics.
  Signed-off-by: Alex Le <[email protected]>
* Updated CHANGELOG
  Signed-off-by: Alex Le <[email protected]>
* addressed comments
  Signed-off-by: Alex Le <[email protected]>
* Added back missing metric
  Signed-off-by: Alex Le <[email protected]>
---------
Signed-off-by: Alex Le <[email protected]>
1 parent 75cfab4 commit 3ae12a2

11 files changed

+596
-712
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
* [ENHANCEMENT] Ruler: Add support for filtering by `state` and `health` field on Rules API. #6040
2828
* [ENHANCEMENT] Ruler: Add support for filtering by `match` field on Rules API. #6083
2929
* [ENHANCEMENT] Distributor: Reduce memory usage when error volume is high. #6095
30+
* [ENHANCEMENT] Compactor: Centralize metrics used by compactor and add user label to compactor metrics. #6096
3031
* [ENHANCEMENT] Compactor: Add unique execution ID for each compaction cycle in log for easy debugging. #6097
3132
* [ENHANCEMENT] Ruler: Add support for filtering by `state` and `health` field on Rules API. #6040
3233
* [BUGFIX] Configsdb: Fix endline issue in db password. #5920

pkg/compactor/blocks_cleaner.go

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727

2828
const (
2929
defaultDeleteBlocksConcurrency = 16
30+
reasonValueRetention = "retention"
3031
)
3132

3233
type BlocksCleanerConfig struct {
@@ -56,15 +57,23 @@ type BlocksCleaner struct {
5657
runsLastSuccess prometheus.Gauge
5758
blocksCleanedTotal prometheus.Counter
5859
blocksFailedTotal prometheus.Counter
59-
blocksMarkedForDeletion prometheus.Counter
60+
blocksMarkedForDeletion *prometheus.CounterVec
6061
tenantBlocks *prometheus.GaugeVec
6162
tenantBlocksMarkedForDelete *prometheus.GaugeVec
6263
tenantBlocksMarkedForNoCompaction *prometheus.GaugeVec
6364
tenantPartialBlocks *prometheus.GaugeVec
6465
tenantBucketIndexLastUpdate *prometheus.GaugeVec
6566
}
6667

67-
func NewBlocksCleaner(cfg BlocksCleanerConfig, bucketClient objstore.InstrumentedBucket, usersScanner *cortex_tsdb.UsersScanner, cfgProvider ConfigProvider, logger log.Logger, reg prometheus.Registerer) *BlocksCleaner {
68+
func NewBlocksCleaner(
69+
cfg BlocksCleanerConfig,
70+
bucketClient objstore.InstrumentedBucket,
71+
usersScanner *cortex_tsdb.UsersScanner,
72+
cfgProvider ConfigProvider,
73+
logger log.Logger,
74+
reg prometheus.Registerer,
75+
blocksMarkedForDeletion *prometheus.CounterVec,
76+
) *BlocksCleaner {
6877
c := &BlocksCleaner{
6978
cfg: cfg,
7079
bucketClient: bucketClient,
@@ -95,11 +104,7 @@ func NewBlocksCleaner(cfg BlocksCleanerConfig, bucketClient objstore.Instrumente
95104
Name: "cortex_compactor_block_cleanup_failures_total",
96105
Help: "Total number of blocks failed to be deleted.",
97106
}),
98-
blocksMarkedForDeletion: promauto.With(reg).NewCounter(prometheus.CounterOpts{
99-
Name: blocksMarkedForDeletionName,
100-
Help: blocksMarkedForDeletionHelp,
101-
ConstLabels: prometheus.Labels{"reason": "retention"},
102-
}),
107+
blocksMarkedForDeletion: blocksMarkedForDeletion,
103108

104109
// The following metrics don't have the "cortex_compactor" prefix because not strictly related to
105110
// the compactor. They're just tracked by the compactor because it's the most logical place where these
@@ -374,7 +379,7 @@ func (c *BlocksCleaner) cleanUser(ctx context.Context, userID string, firstRun b
374379
// We do not want to stop the remaining work in the cleaner if an
375380
// error occurs here. Errors are logged in the function.
376381
retention := c.cfgProvider.CompactorBlocksRetentionPeriod(userID)
377-
c.applyUserRetentionPeriod(ctx, idx, retention, userBucket, userLogger)
382+
c.applyUserRetentionPeriod(ctx, idx, retention, userBucket, userLogger, userID)
378383
}
379384

380385
// Generate an updated in-memory version of the bucket index.
@@ -498,7 +503,7 @@ func (c *BlocksCleaner) cleanUserPartialBlocks(ctx context.Context, partials map
498503
}
499504

500505
// applyUserRetentionPeriod marks blocks for deletion which have aged past the retention period.
501-
func (c *BlocksCleaner) applyUserRetentionPeriod(ctx context.Context, idx *bucketindex.Index, retention time.Duration, userBucket objstore.Bucket, userLogger log.Logger) {
506+
func (c *BlocksCleaner) applyUserRetentionPeriod(ctx context.Context, idx *bucketindex.Index, retention time.Duration, userBucket objstore.Bucket, userLogger log.Logger, userID string) {
502507
// The retention period of zero is a special value indicating to never delete.
503508
if retention <= 0 {
504509
return
@@ -511,7 +516,7 @@ func (c *BlocksCleaner) applyUserRetentionPeriod(ctx context.Context, idx *bucke
511516
// the cleaner will retry applying the retention in its next cycle.
512517
for _, b := range blocks {
513518
level.Info(userLogger).Log("msg", "applied retention: marking block for deletion", "block", b.ID, "maxTime", b.MaxTime)
514-
if err := block.MarkForDeletion(ctx, userLogger, userBucket, b.ID, fmt.Sprintf("block exceeding retention of %v", retention), c.blocksMarkedForDeletion); err != nil {
519+
if err := block.MarkForDeletion(ctx, userLogger, userBucket, b.ID, fmt.Sprintf("block exceeding retention of %v", retention), c.blocksMarkedForDeletion.WithLabelValues(userID, reasonValueRetention)); err != nil {
515520
level.Warn(userLogger).Log("msg", "failed to mark block for deletion", "block", b.ID, "err", err)
516521
}
517522
}

pkg/compactor/blocks_cleaner_test.go

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/go-kit/log"
1313
"github.com/oklog/ulid"
1414
"github.com/prometheus/client_golang/prometheus"
15+
"github.com/prometheus/client_golang/prometheus/promauto"
1516
"github.com/prometheus/client_golang/prometheus/testutil"
1617
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
1718
"github.com/stretchr/testify/assert"
@@ -79,8 +80,12 @@ func TestBlockCleaner_KeyPermissionDenied(t *testing.T) {
7980
logger := log.NewNopLogger()
8081
scanner := tsdb.NewUsersScanner(mbucket, tsdb.AllUsers, logger)
8182
cfgProvider := newMockConfigProvider()
83+
blocksMarkedForDeletion := prometheus.NewCounterVec(prometheus.CounterOpts{
84+
Name: blocksMarkedForDeletionName,
85+
Help: blocksMarkedForDeletionHelp,
86+
}, append(commonLabels, ReasonLabelName))
8287

83-
cleaner := NewBlocksCleaner(cfg, mbucket, scanner, cfgProvider, logger, nil)
88+
cleaner := NewBlocksCleaner(cfg, mbucket, scanner, cfgProvider, logger, nil, blocksMarkedForDeletion)
8489

8590
// Clean User with no error
8691
cleaner.bucketClient = bkt
@@ -176,8 +181,12 @@ func testBlocksCleanerWithOptions(t *testing.T, options testBlocksCleanerOptions
176181
logger := log.NewNopLogger()
177182
scanner := tsdb.NewUsersScanner(bucketClient, tsdb.AllUsers, logger)
178183
cfgProvider := newMockConfigProvider()
184+
blocksMarkedForDeletion := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
185+
Name: blocksMarkedForDeletionName,
186+
Help: blocksMarkedForDeletionHelp,
187+
}, append(commonLabels, ReasonLabelName))
179188

180-
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, reg)
189+
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, reg, blocksMarkedForDeletion)
181190
require.NoError(t, services.StartAndAwaitRunning(ctx, cleaner))
182191
defer services.StopAndAwaitTerminated(ctx, cleaner) //nolint:errcheck
183192

@@ -333,8 +342,12 @@ func TestBlocksCleaner_ShouldContinueOnBlockDeletionFailure(t *testing.T) {
333342
logger := log.NewNopLogger()
334343
scanner := tsdb.NewUsersScanner(bucketClient, tsdb.AllUsers, logger)
335344
cfgProvider := newMockConfigProvider()
345+
blocksMarkedForDeletion := prometheus.NewCounterVec(prometheus.CounterOpts{
346+
Name: blocksMarkedForDeletionName,
347+
Help: blocksMarkedForDeletionHelp,
348+
}, append(commonLabels, ReasonLabelName))
336349

337-
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, nil)
350+
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, nil, blocksMarkedForDeletion)
338351
require.NoError(t, services.StartAndAwaitRunning(ctx, cleaner))
339352
defer services.StopAndAwaitTerminated(ctx, cleaner) //nolint:errcheck
340353

@@ -393,8 +406,12 @@ func TestBlocksCleaner_ShouldRebuildBucketIndexOnCorruptedOne(t *testing.T) {
393406
logger := log.NewNopLogger()
394407
scanner := tsdb.NewUsersScanner(bucketClient, tsdb.AllUsers, logger)
395408
cfgProvider := newMockConfigProvider()
409+
blocksMarkedForDeletion := prometheus.NewCounterVec(prometheus.CounterOpts{
410+
Name: blocksMarkedForDeletionName,
411+
Help: blocksMarkedForDeletionHelp,
412+
}, append(commonLabels, ReasonLabelName))
396413

397-
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, nil)
414+
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, nil, blocksMarkedForDeletion)
398415
require.NoError(t, services.StartAndAwaitRunning(ctx, cleaner))
399416
defer services.StopAndAwaitTerminated(ctx, cleaner) //nolint:errcheck
400417

@@ -447,8 +464,12 @@ func TestBlocksCleaner_ShouldRemoveMetricsForTenantsNotBelongingAnymoreToTheShar
447464
reg := prometheus.NewPedanticRegistry()
448465
scanner := tsdb.NewUsersScanner(bucketClient, tsdb.AllUsers, logger)
449466
cfgProvider := newMockConfigProvider()
467+
blocksMarkedForDeletion := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
468+
Name: blocksMarkedForDeletionName,
469+
Help: blocksMarkedForDeletionHelp,
470+
}, append(commonLabels, ReasonLabelName))
450471

451-
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, reg)
472+
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, reg, blocksMarkedForDeletion)
452473
require.NoError(t, cleaner.cleanUsers(ctx, true))
453474

454475
assert.NoError(t, prom_testutil.GatherAndCompare(reg, strings.NewReader(`
@@ -578,8 +599,12 @@ func TestBlocksCleaner_ShouldRemoveBlocksOutsideRetentionPeriod(t *testing.T) {
578599
reg := prometheus.NewPedanticRegistry()
579600
scanner := tsdb.NewUsersScanner(bucketClient, tsdb.AllUsers, logger)
580601
cfgProvider := newMockConfigProvider()
602+
blocksMarkedForDeletion := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
603+
Name: blocksMarkedForDeletionName,
604+
Help: blocksMarkedForDeletionHelp,
605+
}, append(commonLabels, ReasonLabelName))
581606

582-
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, reg)
607+
cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, reg, blocksMarkedForDeletion)
583608

584609
assertBlockExists := func(user string, block ulid.ULID, expectExists bool) {
585610
exists, err := bucketClient.Exists(ctx, path.Join(user, block.String(), metadata.MetaFilename))
@@ -607,9 +632,6 @@ func TestBlocksCleaner_ShouldRemoveBlocksOutsideRetentionPeriod(t *testing.T) {
607632
# TYPE cortex_bucket_blocks_marked_for_deletion_count gauge
608633
cortex_bucket_blocks_marked_for_deletion_count{user="user-1"} 0
609634
cortex_bucket_blocks_marked_for_deletion_count{user="user-2"} 0
610-
# HELP cortex_compactor_blocks_marked_for_deletion_total Total number of blocks marked for deletion in compactor.
611-
# TYPE cortex_compactor_blocks_marked_for_deletion_total counter
612-
cortex_compactor_blocks_marked_for_deletion_total{reason="retention"} 0
613635
`),
614636
"cortex_bucket_blocks_count",
615637
"cortex_bucket_blocks_marked_for_deletion_count",
@@ -650,7 +672,7 @@ func TestBlocksCleaner_ShouldRemoveBlocksOutsideRetentionPeriod(t *testing.T) {
650672
cortex_bucket_blocks_marked_for_deletion_count{user="user-2"} 0
651673
# HELP cortex_compactor_blocks_marked_for_deletion_total Total number of blocks marked for deletion in compactor.
652674
# TYPE cortex_compactor_blocks_marked_for_deletion_total counter
653-
cortex_compactor_blocks_marked_for_deletion_total{reason="retention"} 1
675+
cortex_compactor_blocks_marked_for_deletion_total{reason="retention",user="user-1"} 1
654676
`),
655677
"cortex_bucket_blocks_count",
656678
"cortex_bucket_blocks_marked_for_deletion_count",
@@ -688,7 +710,7 @@ func TestBlocksCleaner_ShouldRemoveBlocksOutsideRetentionPeriod(t *testing.T) {
688710
cortex_bucket_blocks_marked_for_deletion_count{user="user-2"} 0
689711
# HELP cortex_compactor_blocks_marked_for_deletion_total Total number of blocks marked for deletion in compactor.
690712
# TYPE cortex_compactor_blocks_marked_for_deletion_total counter
691-
cortex_compactor_blocks_marked_for_deletion_total{reason="retention"} 1
713+
cortex_compactor_blocks_marked_for_deletion_total{reason="retention",user="user-1"} 1
692714
`),
693715
"cortex_bucket_blocks_count",
694716
"cortex_bucket_blocks_marked_for_deletion_count",
@@ -717,7 +739,8 @@ func TestBlocksCleaner_ShouldRemoveBlocksOutsideRetentionPeriod(t *testing.T) {
717739
cortex_bucket_blocks_marked_for_deletion_count{user="user-2"} 0
718740
# HELP cortex_compactor_blocks_marked_for_deletion_total Total number of blocks marked for deletion in compactor.
719741
# TYPE cortex_compactor_blocks_marked_for_deletion_total counter
720-
cortex_compactor_blocks_marked_for_deletion_total{reason="retention"} 3
742+
cortex_compactor_blocks_marked_for_deletion_total{reason="retention",user="user-1"} 1
743+
cortex_compactor_blocks_marked_for_deletion_total{reason="retention",user="user-2"} 2
721744
`),
722745
"cortex_bucket_blocks_count",
723746
"cortex_bucket_blocks_marked_for_deletion_count",

0 commit comments

Comments
 (0)