@@ -8,9 +8,11 @@ package livenesspb
8
8
import (
9
9
"context"
10
10
"fmt"
11
+ "sync/atomic"
11
12
"time"
12
13
13
14
"github.com/cockroachdb/cockroach/pkg/roachpb"
15
+ "github.com/cockroachdb/cockroach/pkg/rpc"
14
16
"github.com/cockroachdb/cockroach/pkg/util/hlc"
15
17
"github.com/cockroachdb/errors"
16
18
"github.com/cockroachdb/redact"
@@ -159,6 +161,61 @@ type IsLiveMapEntry struct {
159
161
// IsLiveMap is a type alias for a map from NodeID to IsLiveMapEntry.
160
162
type IsLiveMap map [roachpb.NodeID ]IsLiveMapEntry
161
163
164
+ // NodeConnectionHealth represents the minimal interface needed for checking if
165
+ // a node RPC connection is alive.
166
+ type NodeConnectionHealth interface {
167
+ // ConnHealth returns nil if we have an open connection of the request
168
+ // class that works successfully. Otherwise, it returns an error.
169
+ ConnHealth (nodeID roachpb.NodeID , class rpc.ConnectionClass ) error
170
+ }
171
+
172
+ // NodeConnectionStatus is a lightweight wrapper around the
173
+ // NodeConnectionHealth, where calculating the connection status is done lazily
174
+ // upon the first call to IsConnected(). This is useful as a member in structs
175
+ // where we sometimes want to calculate the connection status, and sometimes we
176
+ // don't.
177
+ type NodeConnectionStatus struct {
178
+ nodeConnectionHealth NodeConnectionHealth
179
+ nodeID roachpb.NodeID
180
+ // calculatedConnected specifies whether we did calculate the connected field
181
+ // or not. If we haven't, we can't rely on the value of connected.
182
+ calculatedConnected atomic.Bool
183
+ // connected is an atomic boolean that tracks whether we are connected to the node.
184
+ connected atomic.Bool
185
+ }
186
+
187
+ func NewNodeConnectionStatus (
188
+ nodeID roachpb.NodeID , nodeConnectionHealth NodeConnectionHealth ,
189
+ ) * NodeConnectionStatus {
190
+ ncs := & NodeConnectionStatus {
191
+ nodeConnectionHealth : nodeConnectionHealth ,
192
+ nodeID : nodeID ,
193
+ }
194
+ return ncs
195
+ }
196
+
197
+ // SetIsConnected changes the connection status of the node.
198
+ func (ncs * NodeConnectionStatus ) SetIsConnected (connected bool ) {
199
+ ncs .connected .Store (connected )
200
+ ncs .calculatedConnected .Store (true )
201
+ }
202
+
203
+ // IsConnected checks if we are connected to the supplied nodeID. It only
204
+ // performs the calculation the first time. Future calls will use the cached
205
+ // version.
206
+ func (ncs * NodeConnectionStatus ) IsConnected () bool {
207
+ if ! ncs .calculatedConnected .Load () {
208
+ // Calculate the connection status if we haven't done that before.
209
+ // Some tests will set the nodeDialer to nil, so we need to check for that.
210
+ connected := ncs .nodeConnectionHealth == nil ||
211
+ ncs .nodeConnectionHealth .ConnHealth (ncs .nodeID , rpc .SystemClass ) == nil
212
+ ncs .SetIsConnected (connected )
213
+ ncs .calculatedConnected .Store (true )
214
+ }
215
+
216
+ return ncs .connected .Load ()
217
+ }
218
+
162
219
// NodeVitality should be used any place other than epoch leases where it is
163
220
// necessary to determine if a node is currently alive and what its health is.
164
221
// Aliveness and deadness are concepts that refer to our best guess of the
@@ -173,8 +230,9 @@ type NodeVitality struct {
173
230
draining bool
174
231
// membership is whether the node is active or in a state of decommissioning.
175
232
membership MembershipStatus
176
- // connected is whether we are currently directly connect to this node.
177
- connected bool
233
+ // nodeConnectionStatus calculates whether we are currently directly connected
234
+ // to this node.
235
+ nodeConnectionStatus * NodeConnectionStatus
178
236
179
237
// When the record is created. Records are not held for long, but they should
180
238
// always give consistent results when asked.
@@ -285,7 +343,7 @@ func (nv NodeVitality) IsLive(usage VitalityUsage) bool {
285
343
return nv .isAliveAndConnected ()
286
344
}
287
345
case NetworkMap :
288
- return nv .connected
346
+ return nv .nodeConnectionStatus . IsConnected ()
289
347
case LossOfQuorum :
290
348
return nv .isAlive ()
291
349
case ReplicaGCQueue :
@@ -315,7 +373,7 @@ func (nv NodeVitality) isAvailableNotDraining() bool {
315
373
}
316
374
317
375
func (nv NodeVitality ) isAliveAndConnected () bool {
318
- return nv .isAvailableNotDraining () && nv .connected
376
+ return nv .isAvailableNotDraining () && nv .nodeConnectionStatus . IsConnected ()
319
377
}
320
378
321
379
// isAliveEpoch is used for epoch leases. It is similar to isAlive, but doesn't
@@ -535,7 +593,7 @@ func (l Liveness) CreateNodeVitality(
535
593
now hlc.Timestamp ,
536
594
descUpdateTime hlc.Timestamp ,
537
595
descUnavailableTime hlc.Timestamp ,
538
- connected bool ,
596
+ connectionStatus * NodeConnectionStatus ,
539
597
timeUntilNodeDead time.Duration ,
540
598
timeAfterNodeSuspect time.Duration ,
541
599
) NodeVitality {
@@ -546,7 +604,7 @@ func (l Liveness) CreateNodeVitality(
546
604
nodeID : l .NodeID ,
547
605
draining : l .Draining ,
548
606
membership : l .Membership ,
549
- connected : connected ,
607
+ nodeConnectionStatus : connectionStatus ,
550
608
now : now ,
551
609
descUpdateTime : descUpdateTime ,
552
610
descUnavailableTime : descUnavailableTime ,
0 commit comments