Skip to content

Commit 6e9eb1d

Browse files
authored
Merge pull request #8938 from bhandras/etcd-leader-election-fixups
multi: check leader status with our health checker to correctly shut down LND if network partitions
2 parents 04dde98 + 037161e commit 6e9eb1d

File tree

18 files changed

+416
-33
lines changed

18 files changed

+416
-33
lines changed

cluster/etcd_elector.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,27 @@ func (e *etcdLeaderElector) Leader(ctx context.Context) (string, error) {
9999
return "", err
100100
}
101101

102+
if resp == nil || len(resp.Kvs) == 0 {
103+
return "", nil
104+
}
105+
102106
return string(resp.Kvs[0].Value), nil
103107
}
104108

109+
// IsLeader returns true if the caller is the leader.
110+
func (e *etcdLeaderElector) IsLeader(ctx context.Context) (bool, error) {
111+
resp, err := e.election.Leader(ctx)
112+
if err != nil {
113+
return false, err
114+
}
115+
116+
if resp == nil || len(resp.Kvs) == 0 {
117+
return false, nil
118+
}
119+
120+
return string(resp.Kvs[0].Value) == e.id, nil
121+
}
122+
105123
// Campaign will start a new leader election campaign. Campaign will block until
106124
// the elector context is canceled or the caller is elected as the leader.
107125
func (e *etcdLeaderElector) Campaign(ctx context.Context) error {
@@ -110,6 +128,6 @@ func (e *etcdLeaderElector) Campaign(ctx context.Context) error {
110128

111129
// Resign resigns the leader role allowing other election members to take
112130
// the place.
113-
func (e *etcdLeaderElector) Resign() error {
114-
return e.election.Resign(context.Background())
131+
func (e *etcdLeaderElector) Resign(ctx context.Context) error {
132+
return e.election.Resign(ctx)
115133
}

cluster/etcd_elector_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,12 @@ func TestEtcdElector(t *testing.T) {
8787
tmp := <-ch
8888
first, err := tmp.Leader(ctxb)
8989
require.NoError(t, err)
90-
require.NoError(t, tmp.Resign())
90+
require.NoError(t, tmp.Resign(ctxb))
9191

9292
tmp = <-ch
9393
second, err := tmp.Leader(ctxb)
9494
require.NoError(t, err)
95-
require.NoError(t, tmp.Resign())
95+
require.NoError(t, tmp.Resign(ctxb))
9696

9797
require.Contains(t, []string{id1, id2}, first)
9898
require.Contains(t, []string{id1, id2}, second)

cluster/interface.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,11 @@ type LeaderElector interface {
1919

2020
// Resign resigns from the leader role, allowing other election members
2121
// to take on leadership.
22-
Resign() error
22+
Resign(ctx context.Context) error
2323

2424
// Leader returns the leader value for the current election.
2525
Leader(ctx context.Context) (string, error)
26+
27+
// IsLeader returns true if the caller is the leader.
28+
IsLeader(ctx context.Context) (bool, error)
2629
}

config.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,17 @@ const (
169169
defaultRSBackoff = time.Second * 30
170170
defaultRSAttempts = 1
171171

172+
// Set defaults for a health check which ensures that the leader
173+
// election is functioning correctly. Although this check is off by
174+
// default (as etcd leader election is only used in a clustered setup),
175+
// we still set the default values so that the health check can be
176+
// easily enabled with sane defaults. Note that by default we only run
177+
// this check once, as it is critical for the node's operation.
178+
defaultLeaderCheckInterval = time.Minute
179+
defaultLeaderCheckTimeout = time.Second * 5
180+
defaultLeaderCheckBackoff = time.Second * 5
181+
defaultLeaderCheckAttempts = 1
182+
172183
// defaultRemoteMaxHtlcs specifies the default limit for maximum
173184
// concurrent HTLCs the remote party may add to commitment transactions.
174185
// This value can be overridden with --default-remote-max-htlcs.
@@ -672,6 +683,12 @@ func DefaultConfig() Config {
672683
Attempts: defaultRSAttempts,
673684
Backoff: defaultRSBackoff,
674685
},
686+
LeaderCheck: &lncfg.CheckConfig{
687+
Interval: defaultLeaderCheckInterval,
688+
Timeout: defaultLeaderCheckTimeout,
689+
Attempts: defaultLeaderCheckAttempts,
690+
Backoff: defaultLeaderCheckBackoff,
691+
},
675692
},
676693
Gossip: &lncfg.Gossip{
677694
MaxChannelUpdateBurst: discovery.DefaultMaxChannelUpdateBurst,

docs/release-notes/release-notes-0.18.3.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,10 @@ commitment when the channel was force closed.
150150
* [Fixed](https://github.com/lightningnetwork/lnd/pull/8854) pagination issues
151151
in SQL invoicedb queries.
152152

153+
* [Check](https://github.com/lightningnetwork/lnd/pull/8938) leader status with
154+
our health checker to correctly shut down LND if network partitioning occurs
155+
towards the etcd cluster.
156+
153157
## Code Health
154158

155159
* [Move graph building and

go.mod

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,12 @@ replace google.golang.org/protobuf => github.com/lightninglabs/protobuf-go-hex-d
207207
// Temporary replace until the next version of sqldb is tagged.
208208
replace github.com/lightningnetwork/lnd/sqldb => ./sqldb
209209

210+
// Temporary replace until the next version of healthcheck is tagged.
211+
replace github.com/lightningnetwork/lnd/healthcheck => ./healthcheck
212+
213+
// Temporary replace until the next version of kvdb is tagged.
214+
replace github.com/lightningnetwork/lnd/kvdb => ./kvdb
215+
210216
// If you change this please also update .github/pull_request_template.md and
211217
// docs/INSTALL.md.
212218
go 1.21.4

go.sum

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -452,10 +452,6 @@ github.com/lightningnetwork/lnd/clock v1.1.1 h1:OfR3/zcJd2RhH0RU+zX/77c0ZiOnIMsD
452452
github.com/lightningnetwork/lnd/clock v1.1.1/go.mod h1:mGnAhPyjYZQJmebS7aevElXKTFDuO+uNFFfMXK1W8xQ=
453453
github.com/lightningnetwork/lnd/fn v1.2.0 h1:YTb2m8NN5ZiJAskHeBZAmR1AiPY8SXziIYPAX1VI/ZM=
454454
github.com/lightningnetwork/lnd/fn v1.2.0/go.mod h1:SyFohpVrARPKH3XVAJZlXdVe+IwMYc4OMAvrDY32kw0=
455-
github.com/lightningnetwork/lnd/healthcheck v1.2.4 h1:lLPLac+p/TllByxGSlkCwkJlkddqMP5UCoawCj3mgFQ=
456-
github.com/lightningnetwork/lnd/healthcheck v1.2.4/go.mod h1:G7Tst2tVvWo7cx6mSBEToQC5L1XOGxzZTPB29g9Rv2I=
457-
github.com/lightningnetwork/lnd/kvdb v1.4.8 h1:xH0a5Vi1yrcZ5BEeF2ba3vlKBRxrL9uYXlWTjOjbNTY=
458-
github.com/lightningnetwork/lnd/kvdb v1.4.8/go.mod h1:J2diNABOoII9UrMnxXS5w7vZwP7CA1CStrl8MnIrb3A=
459455
github.com/lightningnetwork/lnd/queue v1.1.1 h1:99ovBlpM9B0FRCGYJo6RSFDlt8/vOkQQZznVb18iNMI=
460456
github.com/lightningnetwork/lnd/queue v1.1.1/go.mod h1:7A6nC1Qrm32FHuhx/mi1cieAiBZo5O6l8IBIoQxvkz4=
461457
github.com/lightningnetwork/lnd/ticker v1.1.1 h1:J/b6N2hibFtC7JLV77ULQp++QLtCwT6ijJlbdiZFbSM=

healthcheck/healthcheck.go

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -234,14 +234,13 @@ func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
234234
// the max attempts are reached. In that case we will
235235
// stop the ticker and quit.
236236
if o.retryCheck(quit, shutdown) {
237-
log.Debugf("Health check: max attempts " +
238-
"failed, monitor exiting")
237+
o.Debugf("max attempts failed, monitor exiting")
239238
return
240239
}
241240

242241
// Exit if we receive the instruction to shutdown.
243242
case <-quit:
244-
log.Debug("Health check: monitor quit")
243+
o.Debugf("monitor quit")
245244
return
246245
}
247246
}
@@ -270,7 +269,7 @@ func (o *Observation) retryCheck(quit chan struct{},
270269
// so we'll invoke our success callback if defined and
271270
// then exit.
272271
if err == nil {
273-
log.Debug("invoking success callback")
272+
o.Debugf("invoking success callback")
274273

275274
// Invoke the success callback.
276275
o.OnSuccess()
@@ -283,25 +282,26 @@ func (o *Observation) retryCheck(quit chan struct{},
283282
"%v", o, o.Timeout)
284283

285284
case <-quit:
286-
log.Debug("Health check: monitor quit")
285+
o.Debugf("monitor quit")
287286
return false
288287
}
289288

290289
// If we have reached our allowed number of attempts, this
291290
// check has failed so we'll fire the on failure callback
292291
// and request shutdown.
293292
if count == o.Attempts {
294-
log.Debug("invoking failure callback")
293+
o.Debugf("invoking failure callback")
295294

296295
o.OnFailure()
297296

298-
shutdown("Health check: %v failed after %v "+
299-
"calls", o, o.Attempts)
297+
shutdown("Health check: %v failed after %v calls", o,
298+
o.Attempts)
299+
300300
return true
301301
}
302302

303-
log.Infof("Health check: %v, call: %v failed with: %v, "+
304-
"backing off for: %v", o, count, err, o.Backoff)
303+
o.Infof("failed with: %v, attempts: %v backing off for: %v",
304+
err, count, o.Backoff)
305305

306306
// If we are still within the number of calls allowed for this
307307
// check, we wait for our back off period to elapse, or exit if
@@ -310,10 +310,22 @@ func (o *Observation) retryCheck(quit chan struct{},
310310
case <-time.After(o.Backoff):
311311

312312
case <-quit:
313-
log.Debug("Health check: monitor quit")
313+
o.Debugf("monitor quit")
314314
return false
315315
}
316316
}
317317

318318
return false
319319
}
320+
321+
// Infof logs an info message for an observation prefixed with the health check
322+
// name.
323+
func (o *Observation) Infof(format string, params ...interface{}) {
324+
log.Debugf(fmt.Sprintf("Health check: %v ", o)+format, params...)
325+
}
326+
327+
// Debugf logs a debug message for an observation prefixed with the health check
328+
// name.
329+
func (o *Observation) Debugf(format string, params ...interface{}) {
330+
log.Debugf(fmt.Sprintf("Health check: %v ", o)+format, params...)
331+
}

itest/list_on_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,10 @@ var allTestCases = []*lntest.TestCase{
233233
Name: "etcd failover",
234234
TestFunc: testEtcdFailover,
235235
},
236+
{
237+
Name: "leader health check",
238+
TestFunc: testLeaderHealthCheck,
239+
},
236240
{
237241
Name: "hold invoice force close",
238242
TestFunc: testHoldInvoiceForceClose,

0 commit comments

Comments
 (0)