Skip to content

Commit 4405f9c

Browse files
alanprot, pracucci and bboreham
authored
Fix Ingesters unable to re-join the cluster when unregister_on_shutdown=false + -ingester.heartbeat-period=0 (#4366)
* Fix problem when ingester heartbeat is disabled and unregister_on_shutdown=false Signed-off-by: Alan Protasio <[email protected]> * Update CHANGELOG.md Co-authored-by: Marco Pracucci <[email protected]> Signed-off-by: Alan Protasio <[email protected]> * Update pkg/ring/lifecycler.go Co-authored-by: Bryan Boreham <[email protected]> Signed-off-by: Alan Protasio <[email protected]> * Update pkg/ring/lifecycler_test.go Co-authored-by: Bryan Boreham <[email protected]> Signed-off-by: Alan Protasio <[email protected]> * Address comments Signed-off-by: Alan Protasio <[email protected]> * Addressing comments on test Signed-off-by: Alan Protasio <[email protected]> Co-authored-by: Marco Pracucci <[email protected]> Co-authored-by: Bryan Boreham <[email protected]>
1 parent a4bf103 commit 4405f9c

File tree

3 files changed

+114
-1
lines changed

3 files changed

+114
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
* [BUGFIX] HA Tracker: when cleaning up obsolete elected replicas from KV store, tracker didn't update number of cluster per user correctly. #4336
3131
* [BUGFIX] Ruler: fixed counting of PromQL evaluation errors as user-errors when updating `cortex_ruler_queries_failed_total`. #4335
3232
* [BUGFIX] Ingester: When using block storage, prevent any reads or writes while the ingester is stopping. This will prevent accessing TSDB blocks once they have been already closed. #4304
33+
* [BUGFIX] Ingester: fixed ingester stuck on start up (LEAVING ring state) when `-ingester.heartbeat-period=0` and `-ingester.unregister-on-shutdown=false`. #4366
3334

3435
## 1.10.0-rc.0 / 2021-06-28
3536

pkg/ring/lifecycler.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -576,7 +576,8 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
576576
return ringDesc, true, nil
577577
}
578578

579-
// If the ingester failed to clean it's ring entry up in can leave it's state in LEAVING.
579+
// If the ingester failed to clean up its ring entry, it can leave its state in LEAVING
580+
// OR unregister_on_shutdown=false
580581
// Move it into ACTIVE to ensure the ingester joins the ring.
581582
if instanceDesc.State == LEAVING && len(instanceDesc.Tokens) == i.cfg.NumTokens {
582583
instanceDesc.State = ACTIVE
@@ -588,6 +589,17 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
588589
i.setTokens(tokens)
589590

590591
level.Info(log.Logger).Log("msg", "existing entry found in ring", "state", i.GetState(), "tokens", len(tokens), "ring", i.RingName)
592+
593+
// Update the ring if the instance has been changed and the heartbeat is disabled.
594+
// We don't need to update the KV here when the heartbeat is enabled, as this info will eventually be updated in the KV
595+
// on the next heartbeat
596+
if i.cfg.HeartbeatPeriod == 0 && !instanceDesc.Equal(ringDesc.Ingesters[i.ID]) {
597+
// Update the timestamp to give the gossiping client a chance to register the ring change.
598+
instanceDesc.Timestamp = time.Now().Unix()
599+
ringDesc.Ingesters[i.ID] = instanceDesc
600+
return ringDesc, true, nil
601+
}
602+
591603
// we haven't modified the ring, don't try to store it.
592604
return nil, true, nil
593605
})

pkg/ring/lifecycler_test.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,106 @@ type noopFlushTransferer struct {
321321
func (f *noopFlushTransferer) Flush() {}
322322
func (f *noopFlushTransferer) TransferOut(ctx context.Context) error { return nil }
323323

324+
func TestRestartIngester_DisabledHeartbeat_unregister_on_shutdown_false(t *testing.T) {
325+
var ringConfig Config
326+
flagext.DefaultValues(&ringConfig)
327+
ringConfig.KVStore.Mock = consul.NewInMemoryClient(GetCodec())
328+
329+
r, err := New(ringConfig, "ingester", IngesterRingKey, nil)
330+
require.NoError(t, err)
331+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), r))
332+
333+
// poll function waits for a condition and returning actual state of the ingesters after the condition succeed.
334+
poll := func(condition func(*Desc) bool) map[string]InstanceDesc {
335+
var ingesters map[string]InstanceDesc
336+
test.Poll(t, 5*time.Second, true, func() interface{} {
337+
d, err := r.KVClient.Get(context.Background(), IngesterRingKey)
338+
require.NoError(t, err)
339+
340+
desc, ok := d.(*Desc)
341+
342+
if ok {
343+
ingesters = desc.Ingesters
344+
}
345+
return ok && condition(desc)
346+
})
347+
348+
return ingesters
349+
}
350+
351+
// Starts Ingester and wait it to became active
352+
startIngesterAndWaitActive := func(ingId string) *Lifecycler {
353+
lifecyclerConfig := testLifecyclerConfig(ringConfig, ingId)
354+
// Disabling heartBeat and unregister_on_shutdown
355+
lifecyclerConfig.UnregisterOnShutdown = false
356+
lifecyclerConfig.HeartbeatPeriod = 0
357+
lifecycler, err := NewLifecycler(lifecyclerConfig, &noopFlushTransferer{}, "lifecycler", IngesterRingKey, true, nil)
358+
require.NoError(t, err)
359+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), lifecycler))
360+
poll(func(desc *Desc) bool {
361+
return desc.Ingesters[ingId].State == ACTIVE
362+
})
363+
return lifecycler
364+
}
365+
366+
// We are going to create 2 fake ingester with disabled heart beat and `unregister_on_shutdown=false` then
367+
// test if the ingester 2 became active after:
368+
// * Clean Shutdown (LEAVING after shutdown)
369+
// * Crashes while in the PENDING or JOINING state
370+
l1 := startIngesterAndWaitActive("ing1")
371+
defer services.StopAndAwaitTerminated(context.Background(), l1) //nolint:errcheck
372+
373+
l2 := startIngesterAndWaitActive("ing2")
374+
375+
ingesters := poll(func(desc *Desc) bool {
376+
return len(desc.Ingesters) == 2 && desc.Ingesters["ing1"].State == ACTIVE && desc.Ingesters["ing2"].State == ACTIVE
377+
})
378+
379+
// Both Ingester should be active and running
380+
assert.Equal(t, ACTIVE, ingesters["ing1"].State)
381+
assert.Equal(t, ACTIVE, ingesters["ing2"].State)
382+
383+
// Stop One ingester gracefully should leave it on LEAVING STATE on the ring
384+
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2))
385+
386+
ingesters = poll(func(desc *Desc) bool {
387+
return len(desc.Ingesters) == 2 && desc.Ingesters["ing2"].State == LEAVING
388+
})
389+
assert.Equal(t, LEAVING, ingesters["ing2"].State)
390+
391+
// Start Ingester2 again - Should flip back to ACTIVE in the ring
392+
l2 = startIngesterAndWaitActive("ing2")
393+
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2))
394+
395+
// Simulate ingester2 crash on startup and left the ring with JOINING state
396+
err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (out interface{}, retry bool, err error) {
397+
desc, ok := in.(*Desc)
398+
require.Equal(t, true, ok)
399+
ingester2Desc := desc.Ingesters["ing2"]
400+
ingester2Desc.State = JOINING
401+
desc.Ingesters["ing2"] = ingester2Desc
402+
return desc, true, nil
403+
})
404+
require.NoError(t, err)
405+
406+
l2 = startIngesterAndWaitActive("ing2")
407+
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2))
408+
409+
// Simulate ingester2 crash on startup and left the ring with PENDING state
410+
err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (out interface{}, retry bool, err error) {
411+
desc, ok := in.(*Desc)
412+
require.Equal(t, true, ok)
413+
ingester2Desc := desc.Ingesters["ing2"]
414+
ingester2Desc.State = PENDING
415+
desc.Ingesters["ing2"] = ingester2Desc
416+
return desc, true, nil
417+
})
418+
require.NoError(t, err)
419+
420+
l2 = startIngesterAndWaitActive("ing2")
421+
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2))
422+
}
423+
324424
func TestTokensOnDisk(t *testing.T) {
325425
var ringConfig Config
326426
flagext.DefaultValues(&ringConfig)

0 commit comments

Comments
 (0)