Skip to content

Commit fef451b

Browse files
authored
Merge pull request #152455 from iskettaneh/backport25.2-144272
release-25.2: liveness: lazily calculate the node connectivity inside NodeVitality
2 parents 588139e + dabadf5 commit fef451b

File tree

7 files changed

+146
-17
lines changed

7 files changed

+146
-17
lines changed

pkg/kv/kvserver/asim/state/liveness.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,13 @@ func convertNodeStatusToNodeVitality(
8989
l.Expiration = liveTs
9090
l.Draining = true
9191
}
92+
ncs := livenesspb.NewNodeConnectionStatus(nid, nil)
93+
ncs.SetIsConnected(true)
9294
entry := l.CreateNodeVitality(
9395
now, /* now */
9496
hlc.Timestamp{}, /* descUpdateTime */
9597
hlc.Timestamp{}, /* descUnavailableTime */
96-
true, /* connected */
98+
ncs, /* nodeConnectionStatus */
9799
timeUntilNodeDead, /* timeUntilNodeDead */
98100
0, /* timeAfterNodeSuspect */
99101
)

pkg/kv/kvserver/liveness/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ go_library(
1818
"//pkg/kv/kvpb",
1919
"//pkg/kv/kvserver/liveness/livenesspb",
2020
"//pkg/roachpb",
21-
"//pkg/rpc",
2221
"//pkg/rpc/nodedialer",
2322
"//pkg/server/telemetry",
2423
"//pkg/settings",

pkg/kv/kvserver/liveness/cache.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313
"github.com/cockroachdb/cockroach/pkg/gossip"
1414
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
1515
"github.com/cockroachdb/cockroach/pkg/roachpb"
16-
"github.com/cockroachdb/cockroach/pkg/rpc"
1716
"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
1817
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
1918
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -244,14 +243,13 @@ func (c *Cache) convertToNodeVitality(l livenesspb.Liveness) livenesspb.NodeVita
244243
// even before the first gossip arrives for a store.
245244

246245
// NB: nodeDialer is nil in some tests.
247-
connected := c.nodeDialer == nil || c.nodeDialer.ConnHealth(l.NodeID, rpc.SystemClass) == nil
248246
lastDescUpdate := c.lastDescriptorUpdate(l.NodeID)
249247

250248
return l.CreateNodeVitality(
251249
c.clock.Now(),
252250
lastDescUpdate.lastUpdateTime,
253251
lastDescUpdate.lastUnavailableTime,
254-
connected,
252+
livenesspb.NewNodeConnectionStatus(l.NodeID, c.nodeDialer),
255253
TimeUntilNodeDead.Get(&c.st.SV),
256254
TimeAfterNodeSuspect.Get(&c.st.SV),
257255
)

pkg/kv/kvserver/liveness/client_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,3 +417,41 @@ func TestLivenessRangeGetsPeriodicallyCompacted(t *testing.T) {
417417
return nil
418418
})
419419
}
420+
421+
// TestNodeConnectionStatusIsConnected verifies that NodeConnectionStatus caches
422+
// the connection status after the first IsConnected() call.
423+
func TestNodeConnectionStatusIsConnected(t *testing.T) {
424+
defer leaktest.AfterTest(t)()
425+
defer log.Scope(t).Close(t)
426+
427+
testCases := []struct {
428+
name string
429+
initialConnected bool
430+
}{
431+
{
432+
name: "initially connected",
433+
initialConnected: true,
434+
},
435+
{
436+
name: "initially disconnected",
437+
initialConnected: false,
438+
},
439+
}
440+
441+
for _, tc := range testCases {
442+
t.Run(tc.name, func(t *testing.T) {
443+
// Create a mock node dialer with the initial connection state.
444+
connHealth := livenesspb.NewMockNodeConnectionHealth(tc.initialConnected)
445+
ncs := livenesspb.NewNodeConnectionStatus(roachpb.NodeID(1), connHealth)
446+
447+
// First call should calculate and cache the connection status.
448+
require.Equal(t, tc.initialConnected, ncs.IsConnected())
449+
450+
// Modify the mock to simulate a connection state change.
451+
connHealth.SetConnected(!tc.initialConnected)
452+
453+
// Subsequent calls should use the cached value (still the initial state).
454+
require.Equal(t, tc.initialConnected, ncs.IsConnected())
455+
})
456+
}
457+
}

pkg/kv/kvserver/liveness/livenesspb/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ go_library(
1313
visibility = ["//visibility:public"],
1414
deps = [
1515
"//pkg/roachpb",
16+
"//pkg/rpc",
1617
"//pkg/util/hlc",
1718
"//pkg/util/timeutil",
1819
"@com_github_cockroachdb_errors//:errors",

pkg/kv/kvserver/liveness/livenesspb/liveness.go

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ package livenesspb
88
import (
99
"context"
1010
"fmt"
11+
"sync/atomic"
1112
"time"
1213

1314
"github.com/cockroachdb/cockroach/pkg/roachpb"
15+
"github.com/cockroachdb/cockroach/pkg/rpc"
1416
"github.com/cockroachdb/cockroach/pkg/util/hlc"
1517
"github.com/cockroachdb/errors"
1618
"github.com/cockroachdb/redact"
@@ -159,6 +161,61 @@ type IsLiveMapEntry struct {
159161
// IsLiveMap is a type alias for a map from NodeID to IsLiveMapEntry.
160162
type IsLiveMap map[roachpb.NodeID]IsLiveMapEntry
161163

164+
// NodeConnectionHealth represents the minimal interface needed for checking if
165+
// a node RPC connection is alive.
166+
type NodeConnectionHealth interface {
167+
// ConnHealth returns nil if we have an open connection of the request
168+
// class that works successfully. Otherwise, it returns an error.
169+
ConnHealth(nodeID roachpb.NodeID, class rpc.ConnectionClass) error
170+
}
171+
172+
// NodeConnectionStatus is a lightweight wrapper around the
173+
// NodeConnectionHealth, where calculating the connection status is done lazily
174+
// upon the first call to IsConnected(). This is useful as a member in structs
175+
// where we sometimes want to calculate the connection status, and sometimes we
176+
// don't.
177+
type NodeConnectionStatus struct {
178+
nodeConnectionHealth NodeConnectionHealth
179+
nodeID roachpb.NodeID
180+
// calculatedConnected specifies whether we did calculate the connected field
181+
// or not. If we haven't, we can't rely on the value of connected.
182+
calculatedConnected atomic.Bool
183+
// connected is an atomic boolean that tracks whether we are connected to the node.
184+
connected atomic.Bool
185+
}
186+
187+
func NewNodeConnectionStatus(
188+
nodeID roachpb.NodeID, nodeConnectionHealth NodeConnectionHealth,
189+
) *NodeConnectionStatus {
190+
ncs := &NodeConnectionStatus{
191+
nodeConnectionHealth: nodeConnectionHealth,
192+
nodeID: nodeID,
193+
}
194+
return ncs
195+
}
196+
197+
// SetIsConnected changes the connection status of the node.
198+
func (ncs *NodeConnectionStatus) SetIsConnected(connected bool) {
199+
ncs.connected.Store(connected)
200+
ncs.calculatedConnected.Store(true)
201+
}
202+
203+
// IsConnected checks if we are connected to the supplied nodeID. It only
204+
// performs the calculation the first time. Future calls will use the cached
205+
// version.
206+
func (ncs *NodeConnectionStatus) IsConnected() bool {
207+
if !ncs.calculatedConnected.Load() {
208+
// Calculate the connection status if we haven't done that before.
209+
// Some tests will set the nodeDialer to nil, so we need to check for that.
210+
connected := ncs.nodeConnectionHealth == nil ||
211+
ncs.nodeConnectionHealth.ConnHealth(ncs.nodeID, rpc.SystemClass) == nil
212+
ncs.SetIsConnected(connected)
213+
ncs.calculatedConnected.Store(true)
214+
}
215+
216+
return ncs.connected.Load()
217+
}
218+
162219
// NodeVitality should be used any place other than epoch leases where it is
163220
// necessary to determine if a node is currently alive and what its health is.
164221
// Aliveness and deadness are concepts that refer to our best guess of the
@@ -173,8 +230,9 @@ type NodeVitality struct {
173230
draining bool
174231
// membership is whether the node is active or in a state of decommissioning.
175232
membership MembershipStatus
176-
// connected is whether we are currently directly connect to this node.
177-
connected bool
233+
// nodeConnectionStatus calculates whether we are currently directly connect
234+
// to this node.
235+
nodeConnectionStatus *NodeConnectionStatus
178236

179237
// When the record is created. Records are not held for long, but they should
180238
// always give consistent results when asked.
@@ -285,7 +343,7 @@ func (nv NodeVitality) IsLive(usage VitalityUsage) bool {
285343
return nv.isAliveAndConnected()
286344
}
287345
case NetworkMap:
288-
return nv.connected
346+
return nv.nodeConnectionStatus.IsConnected()
289347
case LossOfQuorum:
290348
return nv.isAlive()
291349
case ReplicaGCQueue:
@@ -315,7 +373,7 @@ func (nv NodeVitality) isAvailableNotDraining() bool {
315373
}
316374

317375
func (nv NodeVitality) isAliveAndConnected() bool {
318-
return nv.isAvailableNotDraining() && nv.connected
376+
return nv.isAvailableNotDraining() && nv.nodeConnectionStatus.IsConnected()
319377
}
320378

321379
// isAliveEpoch is used for epoch leases. It is similar to isAlive, but doesn't
@@ -535,7 +593,7 @@ func (l Liveness) CreateNodeVitality(
535593
now hlc.Timestamp,
536594
descUpdateTime hlc.Timestamp,
537595
descUnavailableTime hlc.Timestamp,
538-
connected bool,
596+
connectionStatus *NodeConnectionStatus,
539597
timeUntilNodeDead time.Duration,
540598
timeAfterNodeSuspect time.Duration,
541599
) NodeVitality {
@@ -546,7 +604,7 @@ func (l Liveness) CreateNodeVitality(
546604
nodeID: l.NodeID,
547605
draining: l.Draining,
548606
membership: l.Membership,
549-
connected: connected,
607+
nodeConnectionStatus: connectionStatus,
550608
now: now,
551609
descUpdateTime: descUpdateTime,
552610
descUnavailableTime: descUnavailableTime,

pkg/kv/kvserver/liveness/livenesspb/liveness_test_helper.go

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ import (
1010
"time"
1111

1212
"github.com/cockroachdb/cockroach/pkg/roachpb"
13+
"github.com/cockroachdb/cockroach/pkg/rpc"
1314
"github.com/cockroachdb/cockroach/pkg/util/hlc"
1415
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
16+
"github.com/cockroachdb/errors"
1517
)
1618

1719
// testNodeVitalityEntry is here to minimize the impact on tests of changing to
@@ -49,10 +51,13 @@ func TestCreateNodeVitality(ids ...roachpb.NodeID) TestNodeVitality {
4951

5052
func (e testNodeVitalityEntry) convert() NodeVitality {
5153
now := e.clock.Now()
54+
ncs := NewNodeConnectionStatus(0, nil)
5255
if e.Alive {
53-
return e.Liveness.CreateNodeVitality(now, now, hlc.Timestamp{}, true, time.Second, time.Second)
56+
ncs.SetIsConnected(true)
57+
return e.Liveness.CreateNodeVitality(now, now, hlc.Timestamp{}, ncs, time.Second, time.Second)
5458
} else {
55-
return e.Liveness.CreateNodeVitality(now, now.AddDuration(-time.Hour), hlc.Timestamp{}, false, time.Second, time.Second)
59+
ncs.SetIsConnected(false)
60+
return e.Liveness.CreateNodeVitality(now, now.AddDuration(-time.Hour), hlc.Timestamp{}, ncs, time.Second, time.Second)
5661
}
5762
}
5863

@@ -159,21 +164,49 @@ func (tnv TestNodeVitality) RestartNode(id roachpb.NodeID) {
159164
// FakeNodeVitality creates a node vitality record that is either dead or alive
160165
// by all accounts.
161166
func FakeNodeVitality(alive bool) NodeVitality {
167+
ncs := NewNodeConnectionStatus(1, nil)
162168
if alive {
169+
ncs.SetIsConnected(true)
163170
return NodeVitality{
164171
nodeID: 1,
165-
connected: true,
172+
nodeConnectionStatus: ncs,
166173
now: hlc.Timestamp{}.AddDuration(time.Nanosecond),
167174
timeUntilNodeDead: time.Second,
168175
timeAfterNodeSuspect: time.Second,
169176
livenessExpiration: hlc.Timestamp{}.AddDuration(2 * time.Nanosecond),
170177
livenessEpoch: 1,
171178
}
172179
} else {
180+
ncs.SetIsConnected(false)
173181
return NodeVitality{
174-
nodeID: 1,
175-
connected: false,
176-
livenessEpoch: 1,
182+
nodeID: 1,
183+
nodeConnectionStatus: ncs,
184+
livenessEpoch: 1,
177185
}
178186
}
179187
}
188+
189+
// MockNodeConnectionHealth implements the NodeConnectionHealth interface for
190+
// testing. It just has one field to simulate the connection state.
191+
type MockNodeConnectionHealth struct {
192+
connected bool
193+
}
194+
195+
// NewMockNodeConnectionHealth creates a new MockNodeConnectionHealth with the
196+
// given connection state.
197+
func NewMockNodeConnectionHealth(connected bool) *MockNodeConnectionHealth {
198+
return &MockNodeConnectionHealth{connected: connected}
199+
}
200+
201+
// SetConnected sets the connection state of the mock node connection health.
202+
func (m *MockNodeConnectionHealth) SetConnected(connected bool) {
203+
m.connected = connected
204+
}
205+
206+
// ConnHealth implements the NodeDialer interface.
207+
func (m *MockNodeConnectionHealth) ConnHealth(_ roachpb.NodeID, _ rpc.ConnectionClass) error {
208+
if m.connected {
209+
return nil
210+
}
211+
return errors.Errorf("not connected")
212+
}

0 commit comments

Comments
 (0)