@@ -33,12 +33,12 @@ import (
3333 "github.com/cortexproject/cortex/pkg/ring"
3434 "github.com/cortexproject/cortex/pkg/ring/client"
3535 "github.com/cortexproject/cortex/pkg/ring/kv"
36- "github.com/cortexproject/cortex/pkg/tenant"
3736 "github.com/cortexproject/cortex/pkg/util"
3837 "github.com/cortexproject/cortex/pkg/util/concurrency"
3938 "github.com/cortexproject/cortex/pkg/util/flagext"
4039 util_log "github.com/cortexproject/cortex/pkg/util/log"
4140 "github.com/cortexproject/cortex/pkg/util/services"
41+ "github.com/cortexproject/cortex/pkg/util/users"
4242)
4343
4444const (
@@ -91,6 +91,8 @@ type MultitenantAlertmanagerConfig struct {
9191
9292 EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"`
9393 DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
94+
95+ CleanUpInterval time.Duration `yaml:"-"`
9496}
9597
9698type ClusterConfig struct {
@@ -284,14 +286,16 @@ type MultitenantAlertmanager struct {
284286
285287 limits Limits
286288
287- allowedTenants * util .AllowedTenants
289+ allowedTenants * users .AllowedTenants
288290
289291 registry prometheus.Registerer
290292 ringCheckErrors prometheus.Counter
291293 tenantsOwned prometheus.Gauge
292294 tenantsDiscovered prometheus.Gauge
293295 syncTotal * prometheus.CounterVec
294296 syncFailures * prometheus.CounterVec
297+
298+ userIndexUpdater * users.UserIndexUpdater
295299}
296300
297301// NewMultitenantAlertmanager creates a new MultitenantAlertmanager.
@@ -374,10 +378,11 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
374378 multitenantMetrics : newMultitenantAlertmanagerMetrics (registerer ),
375379 peer : peer ,
376380 store : store ,
381+ userIndexUpdater : store .GetUserIndexUpdater (),
377382 logger : log .With (logger , "component" , "MultiTenantAlertmanager" ),
378383 registry : registerer ,
379384 limits : limits ,
380- allowedTenants : util .NewAllowedTenants (cfg .EnabledTenants , cfg .DisabledTenants ),
385+ allowedTenants : users .NewAllowedTenants (cfg .EnabledTenants , cfg .DisabledTenants ),
381386 ringCheckErrors : promauto .With (registerer ).NewCounter (prometheus.CounterOpts {
382387 Name : "cortex_alertmanager_ring_check_errors_total" ,
383388 Help : "Number of errors that have occurred when checking the ring for ownership." ,
@@ -667,6 +672,10 @@ func (am *MultitenantAlertmanager) run(ctx context.Context) error {
667672 ringTickerChan = ringTicker .C
668673 }
669674
675+ if am .cfg .ShardingEnabled && am .userIndexUpdater != nil {
676+ go am .userIndexUpdateLoop (ctx )
677+ }
678+
670679 for {
671680 select {
672681 case <- ctx .Done ():
@@ -693,6 +702,32 @@ func (am *MultitenantAlertmanager) run(ctx context.Context) error {
693702 }
694703}
695704
// userIndexUpdateLoop periodically refreshes the user index until ctx is done.
//
// On every tick it checks, via isUserOwned, whether this instance owns the
// fixed key users.UserIndexCompressedFilename and only then calls
// UpdateUserIndex on the configured updater. Failures are logged and retried
// on the next tick; they never terminate the loop.
//
// The caller is expected to run this in its own goroutine and to have checked
// that am.userIndexUpdater is non-nil. The loop returns when ctx is done, or
// immediately if the configured interval is not positive.
func (am *MultitenantAlertmanager) userIndexUpdateLoop(ctx context.Context) {
	// Hardcode ID to check which alertmanager owns updating user index.
	userID := users.UserIndexCompressedFilename

	// Align with clean up interval, with jitter applied to it.
	interval := util.DurationWithJitter(am.userIndexUpdater.GetCleanUpInterval(), 0.1)
	if interval <= 0 {
		// time.NewTicker panics on a non-positive duration, and a panic in
		// this background goroutine would take the whole process down.
		level.Warn(am.logger).Log("msg", "user index update loop not started, clean up interval is not positive", "interval", interval)
		return
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			// ctx.Done() fires on cancellation as well as on deadline expiry,
			// so this is reported as information rather than as an error.
			level.Info(am.logger).Log("msg", "context done, exit user index update loop", "err", ctx.Err())
			return
		case <-ticker.C:
			if !am.isUserOwned(userID) {
				continue
			}
			if err := am.userIndexUpdater.UpdateUserIndex(ctx); err != nil {
				// Wait for next interval. Worst case, the user index scanner will fallback to list strategy.
				level.Error(am.logger).Log("msg", "failed to update user index", "err", err)
			}
		}
	}
}
730+
696731func (am * MultitenantAlertmanager ) loadAndSyncConfigs (ctx context.Context , syncReason string ) error {
697732 level .Info (am .logger ).Log ("msg" , "synchronizing alertmanager configs for users" )
698733 am .syncTotal .WithLabelValues (syncReason ).Inc ()
@@ -795,7 +830,7 @@ func (am *MultitenantAlertmanager) isUserOwned(userID string) bool {
795830 return true
796831 }
797832
798- alertmanagers , err := am .ring .Get (shardByUser (userID ), SyncRingOp , nil , nil , nil )
833+ alertmanagers , err := am .ring .Get (users . ShardByUser (userID ), SyncRingOp , nil , nil , nil )
799834 if err != nil {
800835 am .ringCheckErrors .Inc ()
801836 level .Error (am .logger ).Log ("msg" , "failed to load alertmanager configuration" , "user" , userID , "err" , err )
@@ -1005,7 +1040,7 @@ func (am *MultitenantAlertmanager) GetPositionForUser(userID string) int {
10051040 return 0
10061041 }
10071042
1008- set , err := am .ring .Get (shardByUser (userID ), RingOp , nil , nil , nil )
1043+ set , err := am .ring .Get (users . ShardByUser (userID ), RingOp , nil , nil , nil )
10091044 if err != nil {
10101045 level .Error (am .logger ).Log ("msg" , "unable to read the ring while trying to determine the alertmanager position" , "err" , err )
10111046 // If we're unable to determine the position, we don't want a tenant to miss out on the notification - instead,
@@ -1048,7 +1083,7 @@ func (am *MultitenantAlertmanager) HandleRequest(ctx context.Context, in *httpgr
10481083
10491084// serveRequest serves the Alertmanager's web UI and API.
10501085func (am * MultitenantAlertmanager ) serveRequest (w http.ResponseWriter , req * http.Request ) {
1051- userID , err := tenant .TenantID (req .Context ())
1086+ userID , err := users .TenantID (req .Context ())
10521087 if err != nil {
10531088 http .Error (w , err .Error (), http .StatusUnauthorized )
10541089 return
@@ -1106,7 +1141,7 @@ func (am *MultitenantAlertmanager) ReplicateStateForUser(ctx context.Context, us
11061141 level .Debug (am .logger ).Log ("msg" , "message received for replication" , "user" , userID , "key" , part .Key )
11071142
11081143 selfAddress := am .ringLifecycler .GetInstanceAddr ()
1109- err := ring .DoBatch (ctx , RingOp , am .ring , nil , []uint32 {shardByUser (userID )}, func (desc ring.InstanceDesc , _ []int ) error {
1144+ err := ring .DoBatch (ctx , RingOp , am .ring , nil , []uint32 {users . ShardByUser (userID )}, func (desc ring.InstanceDesc , _ []int ) error {
11101145 if desc .GetAddr () == selfAddress {
11111146 return nil
11121147 }
@@ -1137,7 +1172,7 @@ func (am *MultitenantAlertmanager) ReplicateStateForUser(ctx context.Context, us
11371172// state from all replicas, but will consider it a success if state is obtained from at least one replica.
11381173func (am * MultitenantAlertmanager ) ReadFullStateForUser (ctx context.Context , userID string ) ([]* clusterpb.FullState , error ) {
11391174 // Only get the set of replicas which contain the specified user.
1140- key := shardByUser (userID )
1175+ key := users . ShardByUser (userID )
11411176 replicationSet , err := am .ring .Get (key , RingOp , nil , nil , nil )
11421177 if err != nil {
11431178 return nil , err
@@ -1197,7 +1232,7 @@ func (am *MultitenantAlertmanager) ReadFullStateForUser(ctx context.Context, use
11971232
11981233// UpdateState implements the Alertmanager service.
11991234func (am * MultitenantAlertmanager ) UpdateState (ctx context.Context , part * clusterpb.Part ) (* alertmanagerpb.UpdateStateResponse , error ) {
1200- userID , err := tenant .TenantID (ctx )
1235+ userID , err := users .TenantID (ctx )
12011236 if err != nil {
12021237 return nil , err
12031238 }
@@ -1307,7 +1342,7 @@ func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string {
13071342
13081343// ReadState implements the Alertmanager service.
13091344func (am * MultitenantAlertmanager ) ReadState (ctx context.Context , req * alertmanagerpb.ReadStateRequest ) (* alertmanagerpb.ReadStateResponse , error ) {
1310- userID , err := tenant .TenantID (ctx )
1345+ userID , err := users .TenantID (ctx )
13111346 if err != nil {
13121347 return nil , err
13131348 }
0 commit comments