Commit 6be4779

Backfilling the new limits when updating the metrics (#5955)
* Backfilling the new limits when updating the metrics

  Signed-off-by: alanprot <[email protected]>

* make clean-white-noise

  Signed-off-by: alanprot <[email protected]>

---------

Signed-off-by: alanprot <[email protected]>
1 parent be4dd02 commit 6be4779

File tree: 5 files changed, +114 −66 lines

* CHANGELOG.md
* pkg/ingester/ingester.go
* pkg/ingester/ingester_test.go
* pkg/ingester/limiter.go
* pkg/ingester/user_state.go


CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -8,6 +8,7 @@
 * [ENHANCEMENT] Distributor/Querier: Clean stale per-ingester metrics after ingester restarts. #5930
 * [ENHANCEMENT] Distributor/Ring: Allow disabling detailed ring metrics by ring member. #5931
 * [ENHANCEMENT] KV: Etcd Added etcd.ping-without-stream-allowed parameter to disable/enable PermitWithoutStream #5933
+* [ENHANCEMENT] Ingester: Add a new `max_series_per_label_set` limit. This limit functions similarly to `max_series_per_metric`, but allowing users to define the maximum number of series per LabelSet. #5950
 * [CHANGE] Upgrade Dockerfile Node version from 14x to 18x. #5906
 * [CHANGE] Query Frontend/Ruler: Omit empty data field in API response. #5953 #5954
 * [BUGFIX] Configsdb: Fix endline issue in db password. #5920
```
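
To make the new `max_series_per_label_set` behaviour concrete, here is a dependency-light sketch of the idea the changelog entry describes: every series whose labels contain a configured label set counts toward that set's cap, and a push that would exceed the cap is rejected. The type and function names (`labelSetLimit`, `canAddSeries`) and the limit values are illustrative assumptions, not the Cortex implementation.

```go
package main

import (
    "errors"
    "fmt"

    "github.com/prometheus/prometheus/model/labels"
)

// labelSetLimit is an illustrative stand-in for one max_series_per_label_set entry.
type labelSetLimit struct {
    labelSet map[string]string // e.g. {label1: value1}
    limit    int               // maximum number of series matching labelSet
    count    int               // series currently counted against this limit
}

// matches reports whether the metric carries every label of the configured set.
func (l *labelSetLimit) matches(metric labels.Labels) bool {
    for name, value := range l.labelSet {
        if metric.Get(name) != value {
            return false
        }
    }
    return true
}

// canAddSeries rejects a new series if any matching label set is already at its
// cap, otherwise it counts the series against every matching set.
func canAddSeries(limits []*labelSetLimit, metric labels.Labels) error {
    for _, l := range limits {
        if l.matches(metric) && l.count >= l.limit {
            return errors.New("per labelSet series limit exceeded")
        }
    }
    for _, l := range limits {
        if l.matches(metric) {
            l.count++
        }
    }
    return nil
}

func main() {
    limits := []*labelSetLimit{{labelSet: map[string]string{"label1": "value1"}, limit: 2}}
    for i := 0; i < 3; i++ {
        metric := labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i), "label1", "value1")
        fmt.Println(canAddSeries(limits, metric)) // the third series is rejected
    }
}
```

The composite-limit case exercised in the test below (comp1 plus comp2) works the same way: a series counts against every label set it matches.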

pkg/ingester/ingester.go

Lines changed: 22 additions & 15 deletions

```diff
@@ -877,7 +877,7 @@ func (i *Ingester) updateLoop(ctx context.Context) error {
             i.stoppedMtx.RUnlock()
 
         case <-activeSeriesTickerChan:
-            i.updateActiveSeries()
+            i.updateActiveSeries(ctx)
         case <-maxInflightRequestResetTicker.C:
             i.maxInflightQueryRequests.Tick()
         case <-userTSDBConfigTicker.C:
@@ -929,7 +929,7 @@ func (i *Ingester) getMaxExemplars(userID string) int64 {
     return int64(maxExemplarsFromLimits)
 }
 
-func (i *Ingester) updateActiveSeries() {
+func (i *Ingester) updateActiveSeries(ctx context.Context) {
     purgeTime := time.Now().Add(-i.cfg.ActiveSeriesMetricsIdleTimeout)
 
     for _, userID := range i.getTSDBUsers() {
@@ -940,7 +940,9 @@ func (i *Ingester) updateActiveSeries() {
 
         userDB.activeSeries.Purge(purgeTime)
         i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(userDB.activeSeries.Active()))
-        userDB.labelSetCounter.UpdateMetric(userDB, i.metrics.activeSeriesPerLabelSet)
+        if err := userDB.labelSetCounter.UpdateMetric(ctx, userDB, i.metrics.activeSeriesPerLabelSet); err != nil {
+            level.Warn(i.logger).Log("msg", "failed to update per labelSet metrics", "user", userID, "err", err)
+        }
     }
 }
 
@@ -1054,18 +1056,19 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
     // Keep track of some stats which are tracked only if the samples will be
     // successfully committed
     var (
-        succeededSamplesCount     = 0
-        failedSamplesCount        = 0
-        succeededExemplarsCount   = 0
-        failedExemplarsCount      = 0
-        startAppend               = time.Now()
-        sampleOutOfBoundsCount    = 0
-        sampleOutOfOrderCount     = 0
-        sampleTooOldCount         = 0
-        newValueForTimestampCount = 0
-        perUserSeriesLimitCount   = 0
-        perMetricSeriesLimitCount = 0
-        nativeHistogramCount      = 0
+        succeededSamplesCount       = 0
+        failedSamplesCount          = 0
+        succeededExemplarsCount     = 0
+        failedExemplarsCount        = 0
+        startAppend                 = time.Now()
+        sampleOutOfBoundsCount      = 0
+        sampleOutOfOrderCount       = 0
+        sampleTooOldCount           = 0
+        newValueForTimestampCount   = 0
+        perUserSeriesLimitCount     = 0
+        perLabelSetSeriesLimitCount = 0
+        perMetricSeriesLimitCount   = 0
+        nativeHistogramCount        = 0
 
         updateFirstPartial = func(errFn func() error) {
             if firstPartialErr == nil {
@@ -1150,6 +1153,7 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
             })
             continue
         case errors.As(cause, &errMaxSeriesPerLabelSetLimitExceeded{}):
+            perLabelSetSeriesLimitCount++
             updateFirstPartial(func() error {
                 return makeMetricLimitError(perLabelsetSeriesLimit, copiedLabels, i.limiter.FormatError(userID, cause))
             })
@@ -1245,6 +1249,9 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
     if perMetricSeriesLimitCount > 0 {
         validation.DiscardedSamples.WithLabelValues(perMetricSeriesLimit, userID).Add(float64(perMetricSeriesLimitCount))
     }
+    if perLabelSetSeriesLimitCount > 0 {
+        validation.DiscardedSamples.WithLabelValues(perLabelsetSeriesLimit, userID).Add(float64(perLabelSetSeriesLimitCount))
+    }
 
     if nativeHistogramCount > 0 {
         validation.DiscardedSamples.WithLabelValues(nativeHistogramSample, userID).Add(float64(nativeHistogramCount))
```
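
The Push() hunks above follow the ingester's existing accounting pattern: rejections are tallied in a local counter while the batch is appended and flushed to the shared `cortex_discarded_samples_total` counter once, after the loop. Below is a minimal self-contained sketch of that pattern, with the counter re-declared locally rather than taken from `validation.DiscardedSamples`, and a boolean slice standing in for the per-sample limit errors.

```go
package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

// discardedSamples mirrors the shape of cortex_discarded_samples_total{reason, user};
// it is declared here only to keep the sketch self-contained.
var discardedSamples = prometheus.NewCounterVec(prometheus.CounterOpts{
    Name: "cortex_discarded_samples_total",
    Help: "The total number of samples that were discarded.",
}, []string{"reason", "user"})

// pushBatch tallies rejections in a local counter while iterating the batch and
// flushes the tally to the shared CounterVec once at the end, which is the
// pattern Push() uses for the new per_labelset_series_limit reason.
func pushBatch(userID string, rejected []bool) {
    perLabelSetSeriesLimitCount := 0
    for _, r := range rejected {
        if r { // stands in for errors.As(cause, &errMaxSeriesPerLabelSetLimitExceeded{})
            perLabelSetSeriesLimitCount++
        }
    }
    if perLabelSetSeriesLimitCount > 0 {
        discardedSamples.WithLabelValues("per_labelset_series_limit", userID).Add(float64(perLabelSetSeriesLimitCount))
    }
}

func main() {
    prometheus.MustRegister(discardedSamples)
    pushBatch("1", []bool{false, true, true})
    fmt.Println(testutil.ToFloat64(discardedSamples.WithLabelValues("per_labelset_series_limit", "1"))) // 2
}
```

In the real ingester the reason string comes from the `perLabelsetSeriesLimit` constant and the shared counter is `validation.DiscardedSamples`, as the last hunk above shows.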

pkg/ingester/ingester_test.go

Lines changed: 37 additions & 15 deletions

```diff
@@ -109,6 +109,7 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
     require.NoError(t, os.Mkdir(blocksDir, os.ModePerm))
 
     ing, err := prepareIngesterWithBlocksStorageAndLimits(t, defaultIngesterTestConfig(t), limits, tenantLimits, blocksDir, registry)
+    registry.MustRegister(validation.DiscardedSamples)
     require.NoError(t, err)
     require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing))
     // Wait until it's ACTIVE
@@ -132,13 +133,13 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
         }
     }
 
-    ing.updateActiveSeries()
+    ing.updateActiveSeries(ctx)
     require.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(`
         # HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
         # TYPE cortex_ingester_active_series_per_labelset gauge
         cortex_ingester_active_series_per_labelset{labelset="{label1=\"value1\"}",user="1"} 3
         cortex_ingester_active_series_per_labelset{labelset="{label2=\"value2\"}",user="1"} 2
-    `), "cortex_ingester_active_series_per_labelset"))
+    `), "cortex_ingester_active_series_per_labelset", "cortex_discarded_samples_total"))
 
     // Should impose limits
     for _, set := range limits.MaxSeriesPerLabelSet {
@@ -154,13 +155,16 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
         require.ErrorContains(t, err, set.Id)
     }
 
-    ing.updateActiveSeries()
+    ing.updateActiveSeries(ctx)
     require.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(`
+        # HELP cortex_discarded_samples_total The total number of samples that were discarded.
+        # TYPE cortex_discarded_samples_total counter
+        cortex_discarded_samples_total{reason="per_labelset_series_limit",user="1"} 2
         # HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
         # TYPE cortex_ingester_active_series_per_labelset gauge
         cortex_ingester_active_series_per_labelset{labelset="{label1=\"value1\"}",user="1"} 3
         cortex_ingester_active_series_per_labelset{labelset="{label2=\"value2\"}",user="1"} 2
-    `), "cortex_ingester_active_series_per_labelset"))
+    `), "cortex_ingester_active_series_per_labelset", "cortex_discarded_samples_total"))
 
     // Should apply composite limits
     limits.MaxSeriesPerLabelSet = append(limits.MaxSeriesPerLabelSet,
@@ -187,6 +191,21 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
     require.NoError(t, limits.UnmarshalJSON(b))
     tenantLimits.setLimits(userID, &limits)
 
+    // Should backfill
+    ing.updateActiveSeries(ctx)
+    require.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(`
+        # HELP cortex_discarded_samples_total The total number of samples that were discarded.
+        # TYPE cortex_discarded_samples_total counter
+        cortex_discarded_samples_total{reason="per_labelset_series_limit",user="1"} 2
+        # HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
+        # TYPE cortex_ingester_active_series_per_labelset gauge
+        cortex_ingester_active_series_per_labelset{labelset="{comp1=\"compValue1\", comp2=\"compValue2\"}",user="1"} 0
+        cortex_ingester_active_series_per_labelset{labelset="{comp1=\"compValue1\"}",user="1"} 0
+        cortex_ingester_active_series_per_labelset{labelset="{comp2=\"compValue2\"}",user="1"} 0
+        cortex_ingester_active_series_per_labelset{labelset="{label1=\"value1\"}",user="1"} 3
+        cortex_ingester_active_series_per_labelset{labelset="{label2=\"value2\"}",user="1"} 2
+    `), "cortex_ingester_active_series_per_labelset", "cortex_discarded_samples_total"))
+
     // Adding 5 metrics with only 1 label
     for i := 0; i < 5; i++ {
         lbls := []string{labels.MetricName, "metric_name", "comp1", "compValue1"}
@@ -211,16 +230,19 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
     assert.Equal(t, http.StatusBadRequest, int(httpResp.Code))
     require.ErrorContains(t, err, labels.FromStrings("comp1", "compValue1", "comp2", "compValue2").String())
 
-    ing.updateActiveSeries()
+    ing.updateActiveSeries(ctx)
     require.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(`
+        # HELP cortex_discarded_samples_total The total number of samples that were discarded.
+        # TYPE cortex_discarded_samples_total counter
+        cortex_discarded_samples_total{reason="per_labelset_series_limit",user="1"} 3
         # HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
        # TYPE cortex_ingester_active_series_per_labelset gauge
         cortex_ingester_active_series_per_labelset{labelset="{label1=\"value1\"}",user="1"} 3
         cortex_ingester_active_series_per_labelset{labelset="{label2=\"value2\"}",user="1"} 2
         cortex_ingester_active_series_per_labelset{labelset="{comp1=\"compValue1\", comp2=\"compValue2\"}",user="1"} 2
         cortex_ingester_active_series_per_labelset{labelset="{comp1=\"compValue1\"}",user="1"} 7
         cortex_ingester_active_series_per_labelset{labelset="{comp2=\"compValue2\"}",user="1"} 2
-    `), "cortex_ingester_active_series_per_labelset"))
+    `), "cortex_ingester_active_series_per_labelset", "cortex_discarded_samples_total"))
 
     // Should bootstrap and apply limits when configuration change
     limits.MaxSeriesPerLabelSet = append(limits.MaxSeriesPerLabelSet,
@@ -249,7 +271,7 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
     assert.Equal(t, http.StatusBadRequest, int(httpResp.Code))
     require.ErrorContains(t, err, labels.FromStrings(lbls...).String())
 
-    ing.updateActiveSeries()
+    ing.updateActiveSeries(ctx)
     require.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(`
         # HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
         # TYPE cortex_ingester_active_series_per_labelset gauge
@@ -267,7 +289,7 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
     require.NoError(t, err)
     require.NoError(t, limits.UnmarshalJSON(b))
     tenantLimits.setLimits(userID, &limits)
-    ing.updateActiveSeries()
+    ing.updateActiveSeries(ctx)
     require.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(`
         # HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
         # TYPE cortex_ingester_active_series_per_labelset gauge
@@ -281,7 +303,7 @@ func TestIngesterPerLabelsetLimitExceeded(t *testing.T) {
     ing, err = prepareIngesterWithBlocksStorageAndLimits(t, defaultIngesterTestConfig(t), limits, tenantLimits, blocksDir, registry)
     require.NoError(t, err)
     require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing))
-    ing.updateActiveSeries()
+    ing.updateActiveSeries(ctx)
     require.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(`
         # HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
         # TYPE cortex_ingester_active_series_per_labelset gauge
@@ -1207,7 +1229,7 @@ func TestIngester_Push(t *testing.T) {
 
             // Update active series for metrics check.
             if !testData.disableActiveSeries {
-                i.updateActiveSeries()
+                i.updateActiveSeries(ctx)
             }
 
             // Append additional metrics to assert on.
@@ -1274,7 +1296,7 @@ func TestIngester_Push_ShouldCorrectlyTrackMetricsInMultiTenantScenario(t *testi
     }
 
     // Update active series for metrics check.
-    i.updateActiveSeries()
+    i.updateActiveSeries(context.Background())
 
     // Check tracked Prometheus metrics
     expectedMetrics := `
@@ -1361,7 +1383,7 @@ func TestIngester_Push_DecreaseInactiveSeries(t *testing.T) {
     time.Sleep(200 * time.Millisecond)
 
     // Update active series for metrics check. This will remove inactive series.
-    i.updateActiveSeries()
+    i.updateActiveSeries(context.Background())
 
     // Check tracked Prometheus metrics
     expectedMetrics := `
@@ -3733,7 +3755,7 @@ func TestIngesterCompactAndCloseIdleTSDB(t *testing.T) {
     })
 
     pushSingleSampleWithMetadata(t, i)
-    i.updateActiveSeries()
+    i.updateActiveSeries(context.Background())
 
     require.Equal(t, int64(1), i.TSDBState.seriesCount.Load())
 
@@ -3774,7 +3796,7 @@ func TestIngesterCompactAndCloseIdleTSDB(t *testing.T) {
     })
 
     require.Greater(t, testutil.ToFloat64(i.TSDBState.idleTsdbChecks.WithLabelValues(string(tsdbIdleClosed))), float64(0))
-    i.updateActiveSeries()
+    i.updateActiveSeries(context.Background())
     require.Equal(t, int64(0), i.TSDBState.seriesCount.Load()) // Flushing removed all series from memory.
 
     // Verify that user has disappeared from metrics.
@@ -3799,7 +3821,7 @@ func TestIngesterCompactAndCloseIdleTSDB(t *testing.T) {
 
     // Pushing another sample will recreate TSDB.
     pushSingleSampleWithMetadata(t, i)
-    i.updateActiveSeries()
+    i.updateActiveSeries(context.Background())
 
     // User is back.
     require.NoError(t, testutil.GatherAndCompare(r, strings.NewReader(`
```
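
These assertions rely on `testutil.GatherAndCompare` from prometheus/client_golang, which gathers from a registry and diffs the exposition text against an expected string, restricted to the listed metric families; that is why the expected blocks above can now also pin down `cortex_discarded_samples_total`. A standalone usage sketch with an illustrative registry and gauge (not the ingester's own metrics plumbing):

```go
package main

import (
    "fmt"
    "strings"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
    reg := prometheus.NewRegistry()

    // An illustrative gauge shaped like cortex_ingester_active_series_per_labelset.
    gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "cortex_ingester_active_series_per_labelset",
        Help: "Number of currently active series per user and labelset.",
    }, []string{"labelset", "user"})
    reg.MustRegister(gauge)
    gauge.WithLabelValues(`{label1="value1"}`, "1").Set(3)

    // GatherAndCompare only checks the metric families named after the reader.
    err := testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP cortex_ingester_active_series_per_labelset Number of currently active series per user and labelset.
# TYPE cortex_ingester_active_series_per_labelset gauge
cortex_ingester_active_series_per_labelset{labelset="{label1=\"value1\"}",user="1"} 3
`), "cortex_ingester_active_series_per_labelset")
    fmt.Println(err) // <nil> when the gathered output matches the expectation
}
```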

pkg/ingester/limiter.go

Lines changed: 2 additions & 0 deletions

```diff
@@ -105,6 +105,8 @@ func (l *Limiter) AssertMaxMetricsWithMetadataPerUser(userID string, metrics int
     return errMaxMetadataPerUserLimitExceeded
 }
 
+// AssertMaxSeriesPerLabelSet limit has not been reached compared to the current
+// number of metrics with metadata in input and returns an error if so.
 func (l *Limiter) AssertMaxSeriesPerLabelSet(userID string, metric labels.Labels, f func(validation.MaxSeriesPerLabelSet) (int, error)) error {
     m := l.maxSeriesPerLabelSet(userID, metric)
     for _, limit := range m {
```

pkg/ingester/user_state.go

Lines changed: 52 additions & 36 deletions

```diff
@@ -125,45 +125,49 @@ func (m *labelSetCounter) canAddSeriesForLabelSet(ctx context.Context, u *userTS
         s.RUnlock()
 
         // We still dont keep track of this label value so we need to backfill
-        ir, err := u.db.Head().Index()
-        if err != nil {
-            return 0, err
-        }
+        return m.backFillLimit(ctx, u, set, s)
+    })
+}
 
-        defer ir.Close()
+func (m *labelSetCounter) backFillLimit(ctx context.Context, u *userTSDB, limit validation.MaxSeriesPerLabelSet, s *labelSetCounterShard) (int, error) {
+    ir, err := u.db.Head().Index()
+    if err != nil {
+        return 0, err
+    }
 
-        s.Lock()
-        defer s.Unlock()
-        if r, ok := s.valuesCounter[set.Hash]; !ok {
-            postings := make([]index.Postings, 0, len(set.LabelSet))
-            for _, lbl := range set.LabelSet {
-                p, err := ir.Postings(ctx, lbl.Name, lbl.Value)
-                if err != nil {
-                    return 0, err
-                }
-                postings = append(postings, p)
+    defer ir.Close()
+
+    s.Lock()
+    defer s.Unlock()
+    if r, ok := s.valuesCounter[limit.Hash]; !ok {
+        postings := make([]index.Postings, 0, len(limit.LabelSet))
+        for _, lbl := range limit.LabelSet {
+            p, err := ir.Postings(ctx, lbl.Name, lbl.Value)
+            if err != nil {
+                return 0, err
             }
+            postings = append(postings, p)
+        }
 
-            p := index.Intersect(postings...)
+        p := index.Intersect(postings...)
 
-            totalCount := 0
-            for p.Next() {
-                totalCount++
-            }
+        totalCount := 0
+        for p.Next() {
+            totalCount++
+        }
 
-            if p.Err() != nil {
-                return 0, p.Err()
-            }
+        if p.Err() != nil {
+            return 0, p.Err()
+        }
 
-            s.valuesCounter[set.Hash] = &labelSetCounterEntry{
-                count:  totalCount,
-                labels: set.LabelSet,
-            }
-            return totalCount, nil
-        } else {
-            return r.count, nil
+        s.valuesCounter[limit.Hash] = &labelSetCounterEntry{
+            count:  totalCount,
+            labels: limit.LabelSet,
         }
-    })
+        return totalCount, nil
+    } else {
+        return r.count, nil
+    }
 }
 
 func (m *labelSetCounter) increaseSeriesLabelSet(u *userTSDB, metric labels.Labels) {
@@ -195,24 +199,36 @@ func (m *labelSetCounter) decreaseSeriesLabelSet(u *userTSDB, metric labels.Labe
     }
 }
 
-func (m *labelSetCounter) UpdateMetric(u *userTSDB, vec *prometheus.GaugeVec) {
-    currentLbsLimitHash := map[uint64]struct{}{}
+func (m *labelSetCounter) UpdateMetric(ctx context.Context, u *userTSDB, vec *prometheus.GaugeVec) error {
+    currentLbsLimitHash := map[uint64]validation.MaxSeriesPerLabelSet{}
     for _, l := range m.limiter.limits.MaxSeriesPerLabelSet(u.userID) {
-        currentLbsLimitHash[l.Hash] = struct{}{}
+        currentLbsLimitHash[l.Hash] = l
     }
 
     for i := 0; i < numMetricCounterShards; i++ {
         s := m.shards[i]
         s.RLock()
         for h, entry := range s.valuesCounter {
-            // This limit no longer ecists
+            // This limit no longer exists
             if _, ok := currentLbsLimitHash[h]; !ok {
                 vec.DeleteLabelValues(u.userID, entry.labels.String())
                 continue
            }
-
+            delete(currentLbsLimitHash, h)
             vec.WithLabelValues(u.userID, entry.labels.String()).Set(float64(entry.count))
         }
         s.RUnlock()
     }
+
+    // Backfill all limits that are not being tracked yet
+    for _, l := range currentLbsLimitHash {
+        s := m.shards[util.HashFP(model.Fingerprint(l.Hash))%numMetricCounterShards]
+        count, err := m.backFillLimit(ctx, u, l, s)
+        if err != nil {
+            return err
+        }
+        vec.WithLabelValues(u.userID, l.LabelSet.String()).Set(float64(count))
+    }
+
+    return nil
 }
```
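
These hunks are the core of the commit: `UpdateMetric` now indexes the configured limits by hash, refreshes or deletes the gauges it already tracks, and then backfills a gauge for every configured limit that is not tracked yet by calling the extracted `backFillLimit` helper. The following standalone sketch mirrors that flow under simplified assumptions — string label sets and a stubbed count function instead of the TSDB postings walk; the types and names are not the Cortex code.

```go
package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

// trackedEntry stands in for labelSetCounterEntry: a label set already counted.
type trackedEntry struct {
    labelSet string // the real code stores labels.Labels and calls String()
    count    int
}

// limitDef stands in for validation.MaxSeriesPerLabelSet, keyed by hash.
type limitDef struct {
    hash     uint64
    labelSet string
}

// updateMetric mirrors the flow of labelSetCounter.UpdateMetric after this commit:
// index configured limits by hash, update or delete gauges for tracked limits,
// then backfill gauges for configured limits that are not tracked yet (here with
// a stubbed count instead of walking the head index).
func updateMetric(userID string, tracked map[uint64]trackedEntry, configured []limitDef,
    backfillCount func(limitDef) int, vec *prometheus.GaugeVec) {

    current := map[uint64]limitDef{}
    for _, l := range configured {
        current[l.hash] = l
    }

    for h, entry := range tracked {
        if _, ok := current[h]; !ok {
            // This limit no longer exists.
            vec.DeleteLabelValues(userID, entry.labelSet)
            continue
        }
        delete(current, h) // configured and already tracked
        vec.WithLabelValues(userID, entry.labelSet).Set(float64(entry.count))
    }

    // Backfill all limits that are not being tracked yet, so their gauges show
    // up (possibly at 0) as soon as the limit is configured.
    for _, l := range current {
        vec.WithLabelValues(userID, l.labelSet).Set(float64(backfillCount(l)))
    }
}

func main() {
    vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "active_series_per_labelset_sketch",
        Help: "Sketch of cortex_ingester_active_series_per_labelset.",
    }, []string{"user", "labelset"})

    tracked := map[uint64]trackedEntry{1: {labelSet: `{label1="value1"}`, count: 3}}
    configured := []limitDef{
        {hash: 1, labelSet: `{label1="value1"}`},
        {hash: 2, labelSet: `{comp1="compValue1"}`}, // new limit, not tracked yet
    }

    updateMetric("1", tracked, configured, func(limitDef) int { return 0 }, vec)
    fmt.Println(testutil.ToFloat64(vec.WithLabelValues("1", `{comp1="compValue1"}`))) // 0 (backfilled)
}
```

Running it reports the backfilled gauge at 0, which matches the new `// Should backfill` expectations in the test above.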
