
Commit 2948539

Proposal: Create a new "Per LabelSet" limit (#5950)
* Creating Limits per LabelSet
* lint
* fix test
* doc

Signed-off-by: alanprot <[email protected]>
1 parent 2527f9e commit 2948539

File tree

8 files changed: +557 -46 lines changed

docs/configuration/config-file-reference.md

Lines changed: 15 additions & 9 deletions
@@ -3168,6 +3168,10 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
 # CLI flag: -ingester.max-global-series-per-metric
 [max_global_series_per_metric: <int> | default = 0]

+# [Experimental] The maximum number of active series per LabelSet, across the
+# cluster before replication. Empty list to disable.
+[max_series_per_label_set: <list of MaxSeriesPerLabelSet> | default = []]
+
 # The maximum number of active metrics with metadata per user, per ingester. 0
 # to disable.
 # CLI flag: -ingester.max-metadata-per-user
@@ -4009,7 +4013,7 @@ The `ruler_config` configures the Cortex ruler.
 [external_url: <url> | default = ]

 # Labels to add to all alerts.
-[external_labels: <list of Label> | default = []]
+[external_labels: <map of string (labelName) to string (labelValue)> | default = []]

 ruler_client:
   # gRPC client max receive message size (bytes).
@@ -5306,6 +5310,16 @@ otel:
   [tls_insecure_skip_verify: <boolean> | default = false]
 ```

+### `MaxSeriesPerLabelSet`
+
+```yaml
+# The maximum number of active series per LabelSet before replication.
+[limit: <int> | default = ]
+
+# LabelSet which the limit should be applied.
+[label_set: <map of string (labelName) to string (labelValue)> | default = []]
+```
+
 ### `PriorityDef`

 ```yaml
@@ -5350,11 +5364,3 @@ time_window:
 # name of the rule group
 [name: <string> | default = ""]
 ```
-
-### `Label`
-
-```yaml
-[name: <string> | default = ""]
-
-[value: <string> | default = ""]
-```
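
The new `max_series_per_label_set` option takes a list of `MaxSeriesPerLabelSet` entries, each pairing a `limit` with the `label_set` it applies to. As an illustrative sketch only (the placement inside the tenant limits block and every value below are assumptions, not taken from this commit), an override might look like:

```yaml
# Illustrative values only - not part of this commit.
max_series_per_label_set:
  - limit: 10000            # cap active series matching team="platform"
    label_set:
      team: "platform"
  - limit: 2000             # cap active series matching both labels
    label_set:
      team: "search"
      env: "dev"
```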

pkg/ingester/ingester.go

Lines changed: 35 additions & 12 deletions
@@ -254,11 +254,12 @@ func (r tsdbCloseCheckResult) shouldClose() bool {
 }

 type userTSDB struct {
-    db             *tsdb.DB
-    userID         string
-    activeSeries   *ActiveSeries
-    seriesInMetric *metricCounter
-    limiter        *Limiter
+    db              *tsdb.DB
+    userID          string
+    activeSeries    *ActiveSeries
+    seriesInMetric  *metricCounter
+    labelSetCounter *labelSetCounter
+    limiter         *Limiter

     instanceSeriesCount *atomic.Int64 // Shared across all userTSDB instances created by ingester.
     instanceLimitsFn    func() *InstanceLimits
@@ -399,6 +400,10 @@ func (u *userTSDB) PreCreation(metric labels.Labels) error {
         return err
     }

+    if err := u.labelSetCounter.canAddSeriesForLabelSet(context.TODO(), u, metric); err != nil {
+        return err
+    }
+
     return nil
 }

@@ -412,6 +417,7 @@ func (u *userTSDB) PostCreation(metric labels.Labels) {
         return
     }
     u.seriesInMetric.increaseSeriesForMetric(metricName)
+    u.labelSetCounter.increaseSeriesLabelSet(u, metric)
 }

 // PostDeletion implements SeriesLifecycleCallback interface.
@@ -425,6 +431,7 @@ func (u *userTSDB) PostDeletion(metrics map[chunks.HeadSeriesRef]labels.Labels)
             continue
         }
         u.seriesInMetric.decreaseSeriesForMetric(metricName)
+        u.labelSetCounter.decreaseSeriesLabelSet(u, metric)
     }
 }

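These hooks wire the new `labelSetCounter` into the TSDB series lifecycle: `PreCreation` rejects a series whose matching label set is already at its limit, while `PostCreation` and `PostDeletion` keep the per-label-set counts in sync. The counter itself lives in one of the files not shown in this excerpt; the following is only a minimal sketch of that bookkeeping idea, with all names and the matching rule assumed rather than taken from the commit:

```go
// Minimal, hypothetical sketch of per-label-set series accounting.
// Not the implementation from this commit; names and matching rule are assumed.
package sketch

import (
	"fmt"
	"sync"

	"github.com/prometheus/prometheus/model/labels"
)

// labelSetLimit pairs an exact label set with a cap on active series.
type labelSetLimit struct {
	labelSet map[string]string // every key/value must be present on the series
	limit    int
}

// matches reports whether the series carries every label of the configured set.
func (l labelSetLimit) matches(series labels.Labels) bool {
	for name, value := range l.labelSet {
		if series.Get(name) != value {
			return false
		}
	}
	return true
}

// labelSetTracker counts active series per configured label set.
type labelSetTracker struct {
	mtx    sync.Mutex
	limits []labelSetLimit
	counts []int // counts[i] is the number of active series matching limits[i]
}

// canAdd is the PreCreation-style check: fail when any matching set is full.
func (t *labelSetTracker) canAdd(series labels.Labels) error {
	t.mtx.Lock()
	defer t.mtx.Unlock()
	for i, l := range t.limits {
		if l.matches(series) && t.counts[i] >= l.limit {
			return fmt.Errorf("per-labelset series limit of %d exceeded", l.limit)
		}
	}
	return nil
}

// increase and decrease mirror the PostCreation / PostDeletion callbacks.
func (t *labelSetTracker) increase(series labels.Labels) { t.adjust(series, +1) }
func (t *labelSetTracker) decrease(series labels.Labels) { t.adjust(series, -1) }

func (t *labelSetTracker) adjust(series labels.Labels, delta int) {
	t.mtx.Lock()
	defer t.mtx.Unlock()
	for i, l := range t.limits {
		if l.matches(series) {
			t.counts[i] += delta
		}
	}
}
```

In the commit, the counter is constructed with the ingester's `Limiter` (see `newLabelSetCounter(i.limiter)` further down), so the configured limits come from per-tenant overrides rather than being stored on the counter as in this simplified sketch.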
@@ -713,6 +720,15 @@ func NewForFlusher(cfg Config, limits *validation.Overrides, registerer promethe
         TSDBState: newTSDBState(bucketClient, registerer),
         logger:    logger,
     }
+    i.limiter = NewLimiter(
+        limits,
+        i.lifecycler,
+        cfg.DistributorShardingStrategy,
+        cfg.DistributorShardByAllLabels,
+        cfg.LifecyclerConfig.RingConfig.ReplicationFactor,
+        cfg.LifecyclerConfig.RingConfig.ZoneAwarenessEnabled,
+        cfg.AdminLimitMessage,
+    )
     i.metrics = newIngesterMetrics(registerer,
         false,
         false,
@@ -924,6 +940,7 @@ func (i *Ingester) updateActiveSeries() {

         userDB.activeSeries.Purge(purgeTime)
         i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(userDB.activeSeries.Active()))
+        userDB.labelSetCounter.UpdateMetric(userDB, i.metrics.activeSeriesPerLabelSet)
     }
 }

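`updateActiveSeries` now also refreshes a per-label-set active series gauge via `labelSetCounter.UpdateMetric`. The metric definition is in another file of this change; the pattern is essentially a gauge vector keyed by the label set, roughly as sketched below (the metric name and label names are assumptions):

```go
// Rough illustration of publishing per-label-set active series; names are assumed.
package sketch

import "github.com/prometheus/client_golang/prometheus"

var activeSeriesPerLabelSet = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Name: "cortex_ingester_active_series_per_labelset", // hypothetical metric name
	Help: "Number of currently active series per configured label set.",
}, []string{"user", "labelset"})

// updateLabelSetGauges publishes one sample per configured label set for a tenant.
func updateLabelSetGauges(userID string, activeSeries map[string]int) {
	for labelSet, n := range activeSeries {
		activeSeriesPerLabelSet.WithLabelValues(userID, labelSet).Set(float64(n))
	}
}
```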
@@ -1100,38 +1117,43 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
         // of it, so that we can return it back to the distributor, which will return a
         // 400 error to the client. The client (Prometheus) will not retry on 400, and
         // we actually ingested all samples which haven't failed.
-        switch cause := errors.Cause(err); cause {
-        case storage.ErrOutOfBounds:
+        switch cause := errors.Cause(err); {
+        case errors.Is(cause, storage.ErrOutOfBounds):
             sampleOutOfBoundsCount++
             updateFirstPartial(func() error { return wrappedTSDBIngestErr(err, model.Time(s.TimestampMs), ts.Labels) })
             continue

-        case storage.ErrOutOfOrderSample:
+        case errors.Is(cause, storage.ErrOutOfOrderSample):
             sampleOutOfOrderCount++
             updateFirstPartial(func() error { return wrappedTSDBIngestErr(err, model.Time(s.TimestampMs), ts.Labels) })
             continue

-        case storage.ErrDuplicateSampleForTimestamp:
+        case errors.Is(cause, storage.ErrDuplicateSampleForTimestamp):
             newValueForTimestampCount++
             updateFirstPartial(func() error { return wrappedTSDBIngestErr(err, model.Time(s.TimestampMs), ts.Labels) })
             continue

-        case storage.ErrTooOldSample:
+        case errors.Is(cause, storage.ErrTooOldSample):
             sampleTooOldCount++
             updateFirstPartial(func() error { return wrappedTSDBIngestErr(err, model.Time(s.TimestampMs), ts.Labels) })
             continue

-        case errMaxSeriesPerUserLimitExceeded:
+        case errors.Is(cause, errMaxSeriesPerUserLimitExceeded):
             perUserSeriesLimitCount++
             updateFirstPartial(func() error { return makeLimitError(perUserSeriesLimit, i.limiter.FormatError(userID, cause)) })
             continue

-        case errMaxSeriesPerMetricLimitExceeded:
+        case errors.Is(cause, errMaxSeriesPerMetricLimitExceeded):
             perMetricSeriesLimitCount++
             updateFirstPartial(func() error {
                 return makeMetricLimitError(perMetricSeriesLimit, copiedLabels, i.limiter.FormatError(userID, cause))
             })
             continue
+        case errors.As(cause, &errMaxSeriesPerLabelSetLimitExceeded{}):
+            updateFirstPartial(func() error {
+                return makeMetricLimitError(perLabelsetSeriesLimit, copiedLabels, i.limiter.FormatError(userID, cause))
+            })
+            continue
         }

         // The error looks an issue on our side, so we should rollback
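
The switch over `errors.Cause(err)` changes from matching the cause value directly to a tag-less `switch` with `errors.Is`/`errors.As` cases: the existing limit errors are sentinel values, while the new per-label-set error is a struct type (`errMaxSeriesPerLabelSetLimitExceeded{}`), presumably so it can carry details such as which label set was hit, and therefore has to be matched with `errors.As`. A standalone illustration of the two matching styles (the error types below are stand-ins, not Cortex's):

```go
// Standalone illustration of sentinel vs. typed error matching; types are stand-ins.
package main

import (
	"errors"
	"fmt"
)

// Sentinel error: a fixed value, matched by identity with errors.Is.
var errSeriesPerUserLimit = errors.New("per-user series limit exceeded")

// Typed error: a struct that can carry details, matched by type with errors.As.
type perLabelSetLimitError struct{ labelSet string }

func (e perLabelSetLimitError) Error() string {
	return fmt.Sprintf("per-labelset series limit exceeded for %s", e.labelSet)
}

func classify(err error) string {
	switch {
	case errors.Is(err, errSeriesPerUserLimit):
		return "per-user limit"
	case errors.As(err, &perLabelSetLimitError{}):
		return "per-labelset limit"
	default:
		return "other error"
	}
}

func main() {
	fmt.Println(classify(fmt.Errorf("push failed: %w", errSeriesPerUserLimit)))
	fmt.Println(classify(fmt.Errorf("push failed: %w", perLabelSetLimitError{labelSet: `{team="platform"}`})))
}
```

Using `errors.Is` for the existing cases also makes them robust to wrapped errors, which the previous plain value switch was not.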
@@ -2018,6 +2040,7 @@ func (i *Ingester) createTSDB(userID string) (*userTSDB, error) {
         userID:              userID,
         activeSeries:        NewActiveSeries(),
         seriesInMetric:      newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()),
+        labelSetCounter:     newLabelSetCounter(i.limiter),
         ingestedAPISamples:  util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod),
         ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod),
