Skip to content

Commit 5356796

Browse files
authored
[Store Gateway] Token bucket limiter (#6016)
* Create TokenBucket Signed-off-by: Justin Jung <[email protected]>
* Update bucket stores to pass token bucket Signed-off-by: Justin Jung <[email protected]>
* Move limiters to a new file Signed-off-by: Justin Jung <[email protected]>
* Added tests for limiters and token bucket Signed-off-by: Justin Jung <[email protected]>
* Add more tests Signed-off-by: Justin Jung <[email protected]>
* Added enable flag Signed-off-by: Justin Jung <[email protected]>
* Add dryrun feature Signed-off-by: Justin Jung <[email protected]>
* Add doc Signed-off-by: Justin Jung <[email protected]>
* Add changelog Signed-off-by: Justin Jung <[email protected]>
* Lint Signed-off-by: Justin Jung <[email protected]>
* Do not create pod token bucket if the feature is not enabled Signed-off-by: Justin Jung <[email protected]>
* More docs Signed-off-by: Justin Jung <[email protected]>
* Address comments Signed-off-by: Justin Jung <[email protected]>
* Rename podTokenBucket to instanceTokenBucket Signed-off-by: Justin Jung <[email protected]>
* Updated default values Signed-off-by: Justin Jung <[email protected]>
* Rename TokenBucketLimiter to TokenBucketBytesLimiter Signed-off-by: Justin Jung <[email protected]>
* Changed error to httpgrpc Signed-off-by: Justin Jung <[email protected]>
* Nit Signed-off-by: Justin Jung <[email protected]>
* Increment failure metric when token bucket returns error Signed-off-by: Justin Jung <[email protected]>
* Simplify token bucket by making Retrieve to always deduct token Signed-off-by: Justin Jung <[email protected]>
* Throw 429 and 422 for different failure scenarios Signed-off-by: Justin Jung <[email protected]>
* Hide token factors from doc Signed-off-by: Justin Jung <[email protected]>
* Simplified config by combining dryrun and enabled Signed-off-by: Justin Jung <[email protected]>
* Remove test log Signed-off-by: Justin Jung <[email protected]>
* Fix tests Signed-off-by: Justin Jung <[email protected]>
* Fix Signed-off-by: Justin Jung <[email protected]>
---------
Signed-off-by: Justin Jung <[email protected]>
1 parent 42d7327 commit 5356796

File tree

11 files changed

+625
-49
lines changed

11 files changed

+625
-49
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* [FEATURE] OTLP: Support ingesting OTLP exponential metrics as native histograms. #6071
1111
* [FEATURE] Ingester: Add `ingester.instance-limits.max-inflight-query-requests` to allow limiting ingester concurrent queries. #6081
1212
* [FEATURE] Distributor: Add `validation.max-native-histogram-buckets` to limit max number of bucket count. Distributor will try to automatically reduce histogram resolution until it is within the bucket limit or resolution cannot be reduced anymore. #6104
13+
* [FEATURE] Store Gateway: Token bucket limiter. #6016
1314
* [ENHANCEMENT] rulers: Add support to persist tokens in rulers. #5987
1415
* [ENHANCEMENT] Query Frontend/Querier: Added store gateway postings touched count and touched size in Querier stats and log in Query Frontend. #5892
1516
* [ENHANCEMENT] Query Frontend/Querier: Returns `warnings` on prometheus query responses. #5916

docs/blocks-storage/querier.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,24 @@ blocks_storage:
13411341
# CLI flag: -blocks-storage.bucket-store.series-batch-size
13421342
[series_batch_size: <int> | default = 10000]
13431343

1344+
token_bucket_bytes_limiter:
1345+
# Token bucket bytes limiter mode. Supported values are: disabled, dryrun,
1346+
# enabled
1347+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.mode
1348+
[mode: <string> | default = "disabled"]
1349+
1350+
# Instance token bucket size
1351+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.instance-token-bucket-size
1352+
[instance_token_bucket_size: <int> | default = 859832320]
1353+
1354+
# User token bucket size
1355+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.user-token-bucket-size
1356+
[user_token_bucket_size: <int> | default = 644874240]
1357+
1358+
# Request token bucket size
1359+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.request-token-bucket-size
1360+
[request_token_bucket_size: <int> | default = 4194304]
1361+
13441362
tsdb:
13451363
# Local directory to store TSDBs in the ingesters.
13461364
# CLI flag: -blocks-storage.tsdb.dir

docs/blocks-storage/store-gateway.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1466,6 +1466,24 @@ blocks_storage:
14661466
# CLI flag: -blocks-storage.bucket-store.series-batch-size
14671467
[series_batch_size: <int> | default = 10000]
14681468

1469+
token_bucket_bytes_limiter:
1470+
# Token bucket bytes limiter mode. Supported values are: disabled, dryrun,
1471+
# enabled
1472+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.mode
1473+
[mode: <string> | default = "disabled"]
1474+
1475+
# Instance token bucket size
1476+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.instance-token-bucket-size
1477+
[instance_token_bucket_size: <int> | default = 859832320]
1478+
1479+
# User token bucket size
1480+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.user-token-bucket-size
1481+
[user_token_bucket_size: <int> | default = 644874240]
1482+
1483+
# Request token bucket size
1484+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.request-token-bucket-size
1485+
[request_token_bucket_size: <int> | default = 4194304]
1486+
14691487
tsdb:
14701488
# Local directory to store TSDBs in the ingesters.
14711489
# CLI flag: -blocks-storage.tsdb.dir

docs/configuration/config-file-reference.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1899,6 +1899,24 @@ bucket_store:
18991899
# CLI flag: -blocks-storage.bucket-store.series-batch-size
19001900
[series_batch_size: <int> | default = 10000]
19011901

1902+
token_bucket_bytes_limiter:
1903+
# Token bucket bytes limiter mode. Supported values are: disabled, dryrun,
1904+
# enabled
1905+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.mode
1906+
[mode: <string> | default = "disabled"]
1907+
1908+
# Instance token bucket size
1909+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.instance-token-bucket-size
1910+
[instance_token_bucket_size: <int> | default = 859832320]
1911+
1912+
# User token bucket size
1913+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.user-token-bucket-size
1914+
[user_token_bucket_size: <int> | default = 644874240]
1915+
1916+
# Request token bucket size
1917+
# CLI flag: -blocks-storage.bucket-store.token-bucket-bytes-limiter.request-token-bucket-size
1918+
[request_token_bucket_size: <int> | default = 4194304]
1919+
19021920
tsdb:
19031921
# Local directory to store TSDBs in the ingesters.
19041922
# CLI flag: -blocks-storage.tsdb.dir

pkg/storage/tsdb/config.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tsdb
22

33
import (
44
"flag"
5+
"fmt"
56
"path/filepath"
67
"strings"
78
"time"
@@ -52,6 +53,7 @@ var (
5253

5354
ErrInvalidBucketIndexBlockDiscoveryStrategy = errors.New("bucket index block discovery strategy can only be enabled when bucket index is enabled")
5455
ErrBlockDiscoveryStrategy = errors.New("invalid block discovery strategy")
56+
ErrInvalidTokenBucketBytesLimiterMode = errors.New("invalid token bucket bytes limiter mode")
5557
)
5658

5759
// BlocksStorageConfig holds the config information for the blocks storage.
@@ -292,6 +294,22 @@ type BucketStoreConfig struct {
292294

293295
// Controls how many series to fetch per batch in Store Gateway. Default value is 10000.
294296
SeriesBatchSize int `yaml:"series_batch_size"`
297+
298+
// Token bucket configs
299+
TokenBucketBytesLimiter TokenBucketBytesLimiterConfig `yaml:"token_bucket_bytes_limiter"`
300+
}
301+
302+
// TokenBucketBytesLimiterConfig holds the token bucket bytes limiter settings
// that BucketStoreConfig exposes under the `token_bucket_bytes_limiter` key.
type TokenBucketBytesLimiterConfig struct {
303+
// Mode is the limiter mode. RegisterFlags defaults it to "disabled" and
// Validate rejects values outside supportedTokenBucketBytesLimiterModes.
Mode string `yaml:"mode"`
304+
// InstanceTokenBucketSize is the instance token bucket size (flag default: 820 MiB).
InstanceTokenBucketSize int64 `yaml:"instance_token_bucket_size"`
305+
// UserTokenBucketSize is the user token bucket size (flag default: 615 MiB).
UserTokenBucketSize int64 `yaml:"user_token_bucket_size"`
306+
// RequestTokenBucketSize is the request token bucket size (flag default: 4 MiB).
RequestTokenBucketSize int64 `yaml:"request_token_bucket_size"`
307+
// The remaining fields are per-data-type multiplication factors; they are
// tagged doc:"hidden". Flag default for fetched postings: 0.
FetchedPostingsTokenFactor float64 `yaml:"fetched_postings_token_factor" doc:"hidden"`
308+
// Multiplication factor used for touched postings token (flag default: 5).
TouchedPostingsTokenFactor float64 `yaml:"touched_postings_token_factor" doc:"hidden"`
309+
// Multiplication factor used for fetched series token (flag default: 0).
FetchedSeriesTokenFactor float64 `yaml:"fetched_series_token_factor" doc:"hidden"`
310+
// Multiplication factor used for touched series token (flag default: 25).
TouchedSeriesTokenFactor float64 `yaml:"touched_series_token_factor" doc:"hidden"`
311+
// Multiplication factor used for fetched chunks token (flag default: 0).
FetchedChunksTokenFactor float64 `yaml:"fetched_chunks_token_factor" doc:"hidden"`
312+
// Multiplication factor used for touched chunks token (flag default: 1).
TouchedChunksTokenFactor float64 `yaml:"touched_chunks_token_factor" doc:"hidden"`
295313
}
296314

297315
// RegisterFlags registers the BucketStore flags
@@ -325,6 +343,16 @@ func (cfg *BucketStoreConfig) RegisterFlags(f *flag.FlagSet) {
325343
f.BoolVar(&cfg.LazyExpandedPostingsEnabled, "blocks-storage.bucket-store.lazy-expanded-postings-enabled", false, "If true, Store Gateway will estimate postings size and try to lazily expand postings if it downloads less data than expanding all postings.")
326344
f.IntVar(&cfg.SeriesBatchSize, "blocks-storage.bucket-store.series-batch-size", store.SeriesBatchSize, "Controls how many series to fetch per batch in Store Gateway. Default value is 10000.")
327345
f.StringVar(&cfg.BlockDiscoveryStrategy, "blocks-storage.bucket-store.block-discovery-strategy", string(ConcurrentDiscovery), "One of "+strings.Join(supportedBlockDiscoveryStrategies, ", ")+". When set to concurrent, stores will concurrently issue one call per directory to discover active blocks in the bucket. The recursive strategy iterates through all objects in the bucket, recursively traversing into each directory. This avoids N+1 calls at the expense of having slower bucket iterations. bucket_index strategy can be used in Compactor only and utilizes the existing bucket index to fetch block IDs to sync. This avoids iterating the bucket but can be impacted by delays of cleaner creating bucket index.")
346+
f.StringVar(&cfg.TokenBucketBytesLimiter.Mode, "blocks-storage.bucket-store.token-bucket-bytes-limiter.mode", string(TokenBucketBytesLimiterDisabled), fmt.Sprintf("Token bucket bytes limiter mode. Supported values are: %s", strings.Join(supportedTokenBucketBytesLimiterModes, ", ")))
347+
f.Int64Var(&cfg.TokenBucketBytesLimiter.InstanceTokenBucketSize, "blocks-storage.bucket-store.token-bucket-bytes-limiter.instance-token-bucket-size", int64(820*units.Mebibyte), "Instance token bucket size")
348+
f.Int64Var(&cfg.TokenBucketBytesLimiter.UserTokenBucketSize, "blocks-storage.bucket-store.token-bucket-bytes-limiter.user-token-bucket-size", int64(615*units.Mebibyte), "User token bucket size")
349+
f.Int64Var(&cfg.TokenBucketBytesLimiter.RequestTokenBucketSize, "blocks-storage.bucket-store.token-bucket-bytes-limiter.request-token-bucket-size", int64(4*units.Mebibyte), "Request token bucket size")
350+
f.Float64Var(&cfg.TokenBucketBytesLimiter.FetchedPostingsTokenFactor, "blocks-storage.bucket-store.token-bucket-bytes-limiter.fetched-postings-token-factor", 0, "Multiplication factor used for fetched postings token")
351+
f.Float64Var(&cfg.TokenBucketBytesLimiter.TouchedPostingsTokenFactor, "blocks-storage.bucket-store.token-bucket-bytes-limiter.touched-postings-token-factor", 5, "Multiplication factor used for touched postings token")
352+
f.Float64Var(&cfg.TokenBucketBytesLimiter.FetchedSeriesTokenFactor, "blocks-storage.bucket-store.token-bucket-bytes-limiter.fetched-series-token-factor", 0, "Multiplication factor used for fetched series token")
353+
f.Float64Var(&cfg.TokenBucketBytesLimiter.TouchedSeriesTokenFactor, "blocks-storage.bucket-store.token-bucket-bytes-limiter.touched-series-token-factor", 25, "Multiplication factor used for touched series token")
354+
f.Float64Var(&cfg.TokenBucketBytesLimiter.FetchedChunksTokenFactor, "blocks-storage.bucket-store.token-bucket-bytes-limiter.fetched-chunks-token-factor", 0, "Multiplication factor used for fetched chunks token")
355+
f.Float64Var(&cfg.TokenBucketBytesLimiter.TouchedChunksTokenFactor, "blocks-storage.bucket-store.token-bucket-bytes-limiter.touched-chunks-token-factor", 1, "Multiplication factor used for touched chunks token")
328356
}
329357

330358
// Validate the config.
@@ -344,6 +372,9 @@ func (cfg *BucketStoreConfig) Validate() error {
344372
if !util.StringsContain(supportedBlockDiscoveryStrategies, cfg.BlockDiscoveryStrategy) {
345373
return ErrInvalidBucketIndexBlockDiscoveryStrategy
346374
}
375+
if !util.StringsContain(supportedTokenBucketBytesLimiterModes, cfg.TokenBucketBytesLimiter.Mode) {
376+
return ErrInvalidTokenBucketBytesLimiterMode
377+
}
347378
return nil
348379
}
349380

@@ -375,3 +406,17 @@ var supportedBlockDiscoveryStrategies = []string{
375406
string(RecursiveDiscovery),
376407
string(BucketIndexDiscovery),
377408
}
409+
410+
// TokenBucketBytesLimiterMode is the string type of the values accepted by the
// token bucket bytes limiter `mode` setting.
type TokenBucketBytesLimiterMode string
411+

412+
// Supported token bucket bytes limiter modes.
const (
413+
// TokenBucketBytesLimiterDisabled is the default mode; while it is set,
// BucketStores does not create the instance or user token buckets.
TokenBucketBytesLimiterDisabled TokenBucketBytesLimiterMode = "disabled"
414+
// TokenBucketBytesLimiterDryRun is the "dryrun" mode.
// NOTE(review): how it differs from "enabled" at runtime is not visible in this section.
TokenBucketBytesLimiterDryRun TokenBucketBytesLimiterMode = "dryrun"
415+
// TokenBucketBytesLimiterEnabled is the "enabled" mode.
TokenBucketBytesLimiterEnabled TokenBucketBytesLimiterMode = "enabled"
416+
)
417+

418+
// supportedTokenBucketBytesLimiterModes lists every accepted mode as a plain
// string; it is used for the flag help text and by BucketStoreConfig.Validate.
var supportedTokenBucketBytesLimiterModes = []string{
419+
string(TokenBucketBytesLimiterDisabled),
420+
string(TokenBucketBytesLimiterDryRun),
421+
string(TokenBucketBytesLimiterEnabled),
422+
}

pkg/storegateway/bucket_stores.go

Lines changed: 52 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"fmt"
66
"math"
7-
"net/http"
87
"os"
98
"path/filepath"
109
"strings"
@@ -35,6 +34,7 @@ import (
3534

3635
"github.com/cortexproject/cortex/pkg/storage/bucket"
3736
"github.com/cortexproject/cortex/pkg/storage/tsdb"
37+
"github.com/cortexproject/cortex/pkg/util"
3838
"github.com/cortexproject/cortex/pkg/util/backoff"
3939
cortex_errors "github.com/cortexproject/cortex/pkg/util/errors"
4040
util_log "github.com/cortexproject/cortex/pkg/util/log"
@@ -73,6 +73,11 @@ type BucketStores struct {
7373
storesErrorsMu sync.RWMutex
7474
storesErrors map[string]error
7575

76+
instanceTokenBucket *util.TokenBucket
77+
78+
userTokenBucketsMu sync.RWMutex
79+
userTokenBuckets map[string]*util.TokenBucket
80+
7681
// Keeps number of inflight requests
7782
inflightRequestCnt int
7883
inflightRequestMu sync.RWMutex
@@ -115,6 +120,7 @@ func NewBucketStores(cfg tsdb.BlocksStorageConfig, shardingStrategy ShardingStra
115120
metaFetcherMetrics: NewMetadataFetcherMetrics(),
116121
queryGate: queryGate,
117122
partitioner: newGapBasedPartitioner(cfg.BucketStore.PartitionerMaxGapBytes, reg),
123+
userTokenBuckets: make(map[string]*util.TokenBucket),
118124
syncTimes: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
119125
Name: "cortex_bucket_stores_blocks_sync_seconds",
120126
Help: "The total time it takes to perform a sync stores",
@@ -144,6 +150,13 @@ func NewBucketStores(cfg tsdb.BlocksStorageConfig, shardingStrategy ShardingStra
144150
return nil, errors.Wrap(err, "create chunks bytes pool")
145151
}
146152

153+
if u.cfg.BucketStore.TokenBucketBytesLimiter.Mode != string(tsdb.TokenBucketBytesLimiterDisabled) {
154+
u.instanceTokenBucket = util.NewTokenBucket(cfg.BucketStore.TokenBucketBytesLimiter.InstanceTokenBucketSize, promauto.With(reg).NewGauge(prometheus.GaugeOpts{
155+
Name: "cortex_bucket_stores_instance_token_bucket_remaining",
156+
Help: "Number of tokens left in instance token bucket.",
157+
}))
158+
}
159+
147160
if reg != nil {
148161
reg.MustRegister(u.bucketStoreMetrics, u.metaFetcherMetrics)
149162
}
@@ -475,6 +488,12 @@ func (u *BucketStores) closeEmptyBucketStore(userID string) error {
475488
unlockInDefer = false
476489
u.storesMu.Unlock()
477490

491+
if u.cfg.BucketStore.TokenBucketBytesLimiter.Mode != string(tsdb.TokenBucketBytesLimiterDisabled) {
492+
u.userTokenBucketsMu.Lock()
493+
delete(u.userTokenBuckets, userID)
494+
u.userTokenBucketsMu.Unlock()
495+
}
496+
478497
u.metaFetcherMetrics.RemoveUserRegistry(userID)
479498
u.bucketStoreMetrics.RemoveUserRegistry(userID)
480499
return bs.Close()
@@ -612,13 +631,19 @@ func (u *BucketStores) getOrCreateStore(userID string) (*store.BucketStore, erro
612631
bucketStoreOpts = append(bucketStoreOpts, store.WithDebugLogging())
613632
}
614633

634+
if u.cfg.BucketStore.TokenBucketBytesLimiter.Mode != string(tsdb.TokenBucketBytesLimiterDisabled) {
635+
u.userTokenBucketsMu.Lock()
636+
u.userTokenBuckets[userID] = util.NewTokenBucket(u.cfg.BucketStore.TokenBucketBytesLimiter.UserTokenBucketSize, nil)
637+
u.userTokenBucketsMu.Unlock()
638+
}
639+
615640
bs, err := store.NewBucketStore(
616641
userBkt,
617642
fetcher,
618643
u.syncDirForUser(userID),
619644
newChunksLimiterFactory(u.limits, userID),
620645
newSeriesLimiterFactory(u.limits, userID),
621-
newBytesLimiterFactory(u.limits, userID),
646+
newBytesLimiterFactory(u.limits, userID, u.getUserTokenBucket(userID), u.instanceTokenBucket, u.cfg.BucketStore.TokenBucketBytesLimiter, u.getTokensToRetrieve),
622647
u.partitioner,
623648
u.cfg.BucketStore.BlockSyncConcurrency,
624649
false, // No need to enable backward compatibility with Thanos pre 0.8.0 queriers
@@ -680,6 +705,31 @@ func (u *BucketStores) deleteLocalFilesForExcludedTenants(includeUserIDs map[str
680705
}
681706
}
682707

708+
func (u *BucketStores) getUserTokenBucket(userID string) *util.TokenBucket {
709+
u.userTokenBucketsMu.RLock()
710+
defer u.userTokenBucketsMu.RUnlock()
711+
return u.userTokenBuckets[userID]
712+
}
713+
714+
func (u *BucketStores) getTokensToRetrieve(tokens uint64, dataType store.StoreDataType) int64 {
715+
tokensToRetrieve := float64(tokens)
716+
switch dataType {
717+
case store.PostingsFetched:
718+
tokensToRetrieve *= u.cfg.BucketStore.TokenBucketBytesLimiter.FetchedPostingsTokenFactor
719+
case store.PostingsTouched:
720+
tokensToRetrieve *= u.cfg.BucketStore.TokenBucketBytesLimiter.TouchedPostingsTokenFactor
721+
case store.SeriesFetched:
722+
tokensToRetrieve *= u.cfg.BucketStore.TokenBucketBytesLimiter.FetchedSeriesTokenFactor
723+
case store.SeriesTouched:
724+
tokensToRetrieve *= u.cfg.BucketStore.TokenBucketBytesLimiter.TouchedSeriesTokenFactor
725+
case store.ChunksFetched:
726+
tokensToRetrieve *= u.cfg.BucketStore.TokenBucketBytesLimiter.FetchedChunksTokenFactor
727+
case store.ChunksTouched:
728+
tokensToRetrieve *= u.cfg.BucketStore.TokenBucketBytesLimiter.TouchedChunksTokenFactor
729+
}
730+
return int64(tokensToRetrieve)
731+
}
732+
683733
func getUserIDFromGRPCContext(ctx context.Context) string {
684734
meta, ok := metadata.FromIncomingContext(ctx)
685735
if !ok {
@@ -730,50 +780,3 @@ type spanSeriesServer struct {
730780
// Context returns the context held in the ctx field of spanSeriesServer.
func (s spanSeriesServer) Context() context.Context {
731781
return s.ctx
732782
}
733-
734-
type limiter struct {
735-
limiter *store.Limiter
736-
}
737-
738-
func (c *limiter) Reserve(num uint64) error {
739-
return c.ReserveWithType(num, 0)
740-
}
741-
742-
func (c *limiter) ReserveWithType(num uint64, _ store.StoreDataType) error {
743-
err := c.limiter.Reserve(num)
744-
if err != nil {
745-
return httpgrpc.Errorf(http.StatusUnprocessableEntity, err.Error())
746-
}
747-
748-
return nil
749-
}
750-
751-
func newChunksLimiterFactory(limits *validation.Overrides, userID string) store.ChunksLimiterFactory {
752-
return func(failedCounter prometheus.Counter) store.ChunksLimiter {
753-
// Since limit overrides could be live reloaded, we have to get the current user's limit
754-
// each time a new limiter is instantiated.
755-
return &limiter{
756-
limiter: store.NewLimiter(uint64(limits.MaxChunksPerQueryFromStore(userID)), failedCounter),
757-
}
758-
}
759-
}
760-
761-
func newSeriesLimiterFactory(limits *validation.Overrides, userID string) store.SeriesLimiterFactory {
762-
return func(failedCounter prometheus.Counter) store.SeriesLimiter {
763-
// Since limit overrides could be live reloaded, we have to get the current user's limit
764-
// each time a new limiter is instantiated.
765-
return &limiter{
766-
limiter: store.NewLimiter(uint64(limits.MaxFetchedSeriesPerQuery(userID)), failedCounter),
767-
}
768-
}
769-
}
770-
771-
func newBytesLimiterFactory(limits *validation.Overrides, userID string) store.BytesLimiterFactory {
772-
return func(failedCounter prometheus.Counter) store.BytesLimiter {
773-
// Since limit overrides could be live reloaded, we have to get the current user's limit
774-
// each time a new limiter is instantiated.
775-
return &limiter{
776-
limiter: store.NewLimiter(uint64(limits.MaxDownloadedBytesPerRequest(userID)), failedCounter),
777-
}
778-
}
779-
}

0 commit comments

Comments
 (0)