Skip to content

Commit 8016956

Browse files
authored
Fix data race in TestRecoverAlertsPostOutage (#6750)
* Fix data race in TestRecoverAlertsPostOutage Previously, TestRecoverAlertsPostOutage had a data race: ``` ================== WARNING: DATA RACE Read at 0x00c025bbfc30 by goroutine 56074: github.com/prometheus/prometheus/rules.(*Group).Eval.func1() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:569 +0xc8a github.com/prometheus/prometheus/rules.(*Group).Eval() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:666 +0x4e5 github.com/prometheus/prometheus/rules.DefaultEvalIterationFunc() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:81 +0x1a5 github.com/cortexproject/cortex/pkg/ruler.ruleGroupIterationFunc() /Users/danielsabsay/git/cortex/pkg/ruler/manager.go:272 +0x4a9 github.com/prometheus/prometheus/rules.(*Group).run() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:256 +0x6a7 github.com/prometheus/prometheus/rules.(*Manager).Update.func1() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:258 +0x11b github.com/prometheus/prometheus/rules.(*Manager).Update.gowrap2() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:259 +0x41 Previous write at 0x00c025bbfc30 by goroutine 48: github.com/prometheus/prometheus/rules.(*Group).Eval.func1.2() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:580 +0x35e runtime.deferreturn() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/runtime/panic.go:605 +0x5d github.com/prometheus/prometheus/rules.(*Group).Eval() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:666 +0x4e5 github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_test.go:2823 +0x2124 github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage_check_races() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_race_test.go:9 +0x30 testing.tRunner() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:1792 +0x225 testing.(*T).Run.gowrap1() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:1851 +0x44 Goroutine 56074 (running) created at: github.com/prometheus/prometheus/rules.(*Manager).Update() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:248 +0x69c github.com/cortexproject/cortex/pkg/ruler.(*DefaultMultiTenantManager).syncRulesToManager() /Users/danielsabsay/git/cortex/pkg/ruler/manager.go:217 +0xa5c github.com/cortexproject/cortex/pkg/ruler.(*DefaultMultiTenantManager).SyncRuleGroups() /Users/danielsabsay/git/cortex/pkg/ruler/manager.go:140 +0x1da github.com/cortexproject/cortex/pkg/ruler.(*Ruler).syncRules() /Users/danielsabsay/git/cortex/pkg/ruler/ruler.go:728 +0x71b github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_test.go:2781 +0x130b github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage_check_races() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_race_test.go:9 +0x30 testing.tRunner() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:1792 +0x225 testing.(*T).Run.gowrap1() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:1851 +0x44 Goroutine 48 (running) created at: testing.(*T).Run() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:1851 +0x8f2 testing.runTests.func1() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:2279 +0x85 testing.tRunner() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:1792 +0x225 testing.runTests() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:2277 +0x96c testing.(*M).Run() /Users/danielsabsay/go/pkg/mod/golang.org/[email protected]/src/testing/testing.go:2142 +0xeea main.main() _testmain.go:167 +0x164 ================== ``` As can be seen above, the data race occurs because the rule evaluation is happening from two different goroutines: 1 that is scheduled by the normal ruler manager and 1 that is run by the test itself. The test itself calls Eval() with specific timestamps to test recovery logic. For purposes of this test, we don't want the normal evaluation loop to run at all. This commit injects a no-op GroupEvalIterationFunc into the ruler manager so that the only rule evaluation that happens is run by the test. Signed-off-by: dsabsay <[email protected]> * Fix linting error Signed-off-by: dsabsay <[email protected]> --------- Signed-off-by: dsabsay <[email protected]>
1 parent e6b828f commit 8016956

File tree

2 files changed

+40
-6
lines changed

2 files changed

+40
-6
lines changed

pkg/ruler/manager.go

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ type DefaultMultiTenantManager struct {
6464
ruleCache map[string][]*promRules.Group
6565
ruleCacheMtx sync.RWMutex
6666
syncRuleMtx sync.Mutex
67+
68+
ruleGroupIterationFunc promRules.GroupEvalIterationFunc
6769
}
6870

6971
func NewDefaultMultiTenantManager(cfg Config, limits RulesLimits, managerFactory ManagerFactory, evalMetrics *RuleEvalMetrics, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
@@ -122,15 +124,25 @@ func NewDefaultMultiTenantManager(cfg Config, limits RulesLimits, managerFactory
122124
Name: "ruler_config_updates_total",
123125
Help: "Total number of config updates triggered by a user",
124126
}, []string{"user"}),
125-
registry: reg,
126-
logger: logger,
127+
registry: reg,
128+
logger: logger,
129+
ruleGroupIterationFunc: defaultRuleGroupIterationFunc,
127130
}
128131
if cfg.RulesBackupEnabled() {
129132
m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg)
130133
}
131134
return m, nil
132135
}
133136

137+
func NewDefaultMultiTenantManagerWithIterationFunc(iterFunc promRules.GroupEvalIterationFunc, cfg Config, limits RulesLimits, managerFactory ManagerFactory, evalMetrics *RuleEvalMetrics, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
138+
manager, err := NewDefaultMultiTenantManager(cfg, limits, managerFactory, evalMetrics, reg, logger)
139+
if err != nil {
140+
return nil, err
141+
}
142+
manager.ruleGroupIterationFunc = iterFunc
143+
return manager, nil
144+
}
145+
134146
func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) {
135147
// this is a safety lock to ensure this method is executed sequentially
136148
r.syncRuleMtx.Lock()
@@ -214,7 +226,7 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
214226
if (rulesUpdated || externalLabelsUpdated) && existing {
215227
r.updateRuleCache(user, manager.RuleGroups())
216228
}
217-
err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, r.cfg.ExternalURL.String(), ruleGroupIterationFunc)
229+
err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, r.cfg.ExternalURL.String(), r.ruleGroupIterationFunc)
218230
r.deleteRuleCache(user)
219231
if err != nil {
220232
r.lastReloadSuccessful.WithLabelValues(user).Set(0)
@@ -257,7 +269,7 @@ func (r *DefaultMultiTenantManager) createRulesManager(user string, ctx context.
257269
return manager
258270
}
259271

260-
func ruleGroupIterationFunc(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) {
272+
func defaultRuleGroupIterationFunc(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) {
261273
logMessage := []interface{}{
262274
"component", "ruler",
263275
"rule_group", g.Name(),

pkg/ruler/ruler_test.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,26 @@ func buildRuler(t *testing.T, rulerConfig Config, querierTestConfig *querier.Tes
340340
return ruler, manager
341341
}
342342

343+
func buildRulerWithIterFunc(t *testing.T, rulerConfig Config, querierTestConfig *querier.TestConfig, store rulestore.RuleStore, rulerAddrMap map[string]*Ruler, ruleGroupIterFunc promRules.GroupEvalIterationFunc) (*Ruler, *DefaultMultiTenantManager) {
344+
engine, queryable, pusher, logger, overrides, reg := testSetup(t, querierTestConfig)
345+
metrics := NewRuleEvalMetrics(rulerConfig, reg)
346+
managerFactory := DefaultTenantManagerFactory(rulerConfig, pusher, queryable, engine, overrides, metrics, reg)
347+
manager, err := NewDefaultMultiTenantManagerWithIterationFunc(ruleGroupIterFunc, rulerConfig, &ruleLimits{}, managerFactory, metrics, reg, log.NewNopLogger())
348+
require.NoError(t, err)
349+
350+
ruler, err := newRuler(
351+
rulerConfig,
352+
manager,
353+
reg,
354+
logger,
355+
store,
356+
overrides,
357+
newMockClientsPool(rulerConfig, logger, reg, rulerAddrMap),
358+
)
359+
require.NoError(t, err)
360+
return ruler, manager
361+
}
362+
343363
func newTestRuler(t *testing.T, rulerConfig Config, store rulestore.RuleStore, querierTestConfig *querier.TestConfig) *Ruler {
344364
ruler, _ := buildRuler(t, rulerConfig, querierTestConfig, store, nil)
345365
require.NoError(t, services.StartAndAwaitRunning(context.Background(), ruler))
@@ -2776,8 +2796,10 @@ func TestRecoverAlertsPostOutage(t *testing.T) {
27762796
querier.UseAlwaysQueryable(newEmptyQueryable()),
27772797
}
27782798

2779-
// create a ruler but don't start it. instead, we'll evaluate the rule groups manually.
2780-
r, _ := buildRuler(t, rulerCfg, &querier.TestConfig{Cfg: querierConfig, Distributor: d, Stores: queryables}, store, nil)
2799+
// Define a no-op GroupEvalIterationFunc to avoid races between the scheduled Eval() execution and the evaluations invoked by this test.
2800+
evalFunc := func(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) {}
2801+
2802+
r, _ := buildRulerWithIterFunc(t, rulerCfg, &querier.TestConfig{Cfg: querierConfig, Distributor: d, Stores: queryables}, store, nil, evalFunc)
27812803
r.syncRules(context.Background(), rulerSyncReasonInitial)
27822804

27832805
// assert initial state of rule group

0 commit comments

Comments
 (0)