Skip to content

Commit dabadf5

Browse files
committed
liveness: lazily calculate the node connectivity inside NodeVitality
Before this commit, always calculate if we are connected to the node as part of the NodeVitality creation. However, most uses of node vitality don't need that calculation to be made. This commit addresses this performance issue by lazily calculating the connected field only when requested. Fixes: #143144 Release note: None
1 parent 588139e commit dabadf5

File tree

7 files changed

+146
-17
lines changed

7 files changed

+146
-17
lines changed

pkg/kv/kvserver/asim/state/liveness.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,13 @@ func convertNodeStatusToNodeVitality(
8989
l.Expiration = liveTs
9090
l.Draining = true
9191
}
92+
ncs := livenesspb.NewNodeConnectionStatus(nid, nil)
93+
ncs.SetIsConnected(true)
9294
entry := l.CreateNodeVitality(
9395
now, /* now */
9496
hlc.Timestamp{}, /* descUpdateTime */
9597
hlc.Timestamp{}, /* descUnavailableTime */
96-
true, /* connected */
98+
ncs, /* nodeConnectionStatus */
9799
timeUntilNodeDead, /* timeUntilNodeDead */
98100
0, /* timeAfterNodeSuspect */
99101
)

pkg/kv/kvserver/liveness/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ go_library(
1818
"//pkg/kv/kvpb",
1919
"//pkg/kv/kvserver/liveness/livenesspb",
2020
"//pkg/roachpb",
21-
"//pkg/rpc",
2221
"//pkg/rpc/nodedialer",
2322
"//pkg/server/telemetry",
2423
"//pkg/settings",

pkg/kv/kvserver/liveness/cache.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313
"github.com/cockroachdb/cockroach/pkg/gossip"
1414
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
1515
"github.com/cockroachdb/cockroach/pkg/roachpb"
16-
"github.com/cockroachdb/cockroach/pkg/rpc"
1716
"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
1817
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
1918
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -244,14 +243,13 @@ func (c *Cache) convertToNodeVitality(l livenesspb.Liveness) livenesspb.NodeVita
244243
// even before the first gossip arrives for a store.
245244

246245
// NB: nodeDialer is nil in some tests.
247-
connected := c.nodeDialer == nil || c.nodeDialer.ConnHealth(l.NodeID, rpc.SystemClass) == nil
248246
lastDescUpdate := c.lastDescriptorUpdate(l.NodeID)
249247

250248
return l.CreateNodeVitality(
251249
c.clock.Now(),
252250
lastDescUpdate.lastUpdateTime,
253251
lastDescUpdate.lastUnavailableTime,
254-
connected,
252+
livenesspb.NewNodeConnectionStatus(l.NodeID, c.nodeDialer),
255253
TimeUntilNodeDead.Get(&c.st.SV),
256254
TimeAfterNodeSuspect.Get(&c.st.SV),
257255
)

pkg/kv/kvserver/liveness/client_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,3 +417,41 @@ func TestLivenessRangeGetsPeriodicallyCompacted(t *testing.T) {
417417
return nil
418418
})
419419
}
420+
421+
// TestNodeConnectionStatusIsConnected verifies that NodeConnectionStatus caches
422+
// the connection status after the first IsConnected() call.
423+
func TestNodeConnectionStatusIsConnected(t *testing.T) {
424+
defer leaktest.AfterTest(t)()
425+
defer log.Scope(t).Close(t)
426+
427+
testCases := []struct {
428+
name string
429+
initialConnected bool
430+
}{
431+
{
432+
name: "initially connected",
433+
initialConnected: true,
434+
},
435+
{
436+
name: "initially disconnected",
437+
initialConnected: false,
438+
},
439+
}
440+
441+
for _, tc := range testCases {
442+
t.Run(tc.name, func(t *testing.T) {
443+
// Create a mock node dialer with the initial connection state.
444+
connHealth := livenesspb.NewMockNodeConnectionHealth(tc.initialConnected)
445+
ncs := livenesspb.NewNodeConnectionStatus(roachpb.NodeID(1), connHealth)
446+
447+
// First call should calculate and cache the connection status.
448+
require.Equal(t, tc.initialConnected, ncs.IsConnected())
449+
450+
// Modify the mock to simulate a connection state change.
451+
connHealth.SetConnected(!tc.initialConnected)
452+
453+
// Subsequent calls should use the cached value (still the initial state).
454+
require.Equal(t, tc.initialConnected, ncs.IsConnected())
455+
})
456+
}
457+
}

pkg/kv/kvserver/liveness/livenesspb/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ go_library(
1313
visibility = ["//visibility:public"],
1414
deps = [
1515
"//pkg/roachpb",
16+
"//pkg/rpc",
1617
"//pkg/util/hlc",
1718
"//pkg/util/timeutil",
1819
"@com_github_cockroachdb_errors//:errors",

pkg/kv/kvserver/liveness/livenesspb/liveness.go

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ package livenesspb
88
import (
99
"context"
1010
"fmt"
11+
"sync/atomic"
1112
"time"
1213

1314
"github.com/cockroachdb/cockroach/pkg/roachpb"
15+
"github.com/cockroachdb/cockroach/pkg/rpc"
1416
"github.com/cockroachdb/cockroach/pkg/util/hlc"
1517
"github.com/cockroachdb/errors"
1618
"github.com/cockroachdb/redact"
@@ -159,6 +161,61 @@ type IsLiveMapEntry struct {
159161
// IsLiveMap is a type alias for a map from NodeID to IsLiveMapEntry.
160162
type IsLiveMap map[roachpb.NodeID]IsLiveMapEntry
161163

164+
// NodeConnectionHealth represents the minimal interface needed for checking if
165+
// a node RPC connection is alive.
166+
type NodeConnectionHealth interface {
167+
// ConnHealth returns nil if we have an open connection of the request
168+
// class that works successfully. Otherwise, it returns an error.
169+
ConnHealth(nodeID roachpb.NodeID, class rpc.ConnectionClass) error
170+
}
171+
172+
// NodeConnectionStatus is a lightweight wrapper around the
173+
// NodeConnectionHealth, where calculating the connection status is done lazily
174+
// upon the first call to IsConnected(). This is useful as a member in structs
175+
// where we sometimes want to calculate the connection status, and sometimes we
176+
// don't.
177+
type NodeConnectionStatus struct {
178+
nodeConnectionHealth NodeConnectionHealth
179+
nodeID roachpb.NodeID
180+
// calculatedConnected specifies whether we did calculate the connected field
181+
// or not. If we haven't, we can't rely on the value of connected.
182+
calculatedConnected atomic.Bool
183+
// connected is an atomic boolean that tracks whether we are connected to the node.
184+
connected atomic.Bool
185+
}
186+
187+
func NewNodeConnectionStatus(
188+
nodeID roachpb.NodeID, nodeConnectionHealth NodeConnectionHealth,
189+
) *NodeConnectionStatus {
190+
ncs := &NodeConnectionStatus{
191+
nodeConnectionHealth: nodeConnectionHealth,
192+
nodeID: nodeID,
193+
}
194+
return ncs
195+
}
196+
197+
// SetIsConnected changes the connection status of the node.
198+
func (ncs *NodeConnectionStatus) SetIsConnected(connected bool) {
199+
ncs.connected.Store(connected)
200+
ncs.calculatedConnected.Store(true)
201+
}
202+
203+
// IsConnected checks if we are connected to the supplied nodeID. It only
204+
// performs the calculation the first time. Future calls will use the cached
205+
// version.
206+
func (ncs *NodeConnectionStatus) IsConnected() bool {
207+
if !ncs.calculatedConnected.Load() {
208+
// Calculate the connection status if we haven't done that before.
209+
// Some tests will set the nodeDialer to nil, so we need to check for that.
210+
connected := ncs.nodeConnectionHealth == nil ||
211+
ncs.nodeConnectionHealth.ConnHealth(ncs.nodeID, rpc.SystemClass) == nil
212+
ncs.SetIsConnected(connected)
213+
ncs.calculatedConnected.Store(true)
214+
}
215+
216+
return ncs.connected.Load()
217+
}
218+
162219
// NodeVitality should be used any place other than epoch leases where it is
163220
// necessary to determine if a node is currently alive and what its health is.
164221
// Aliveness and deadness are concepts that refer to our best guess of the
@@ -173,8 +230,9 @@ type NodeVitality struct {
173230
draining bool
174231
// membership is whether the node is active or in a state of decommissioning.
175232
membership MembershipStatus
176-
// connected is whether we are currently directly connect to this node.
177-
connected bool
233+
// nodeConnectionStatus calculates whether we are currently directly connect
234+
// to this node.
235+
nodeConnectionStatus *NodeConnectionStatus
178236

179237
// When the record is created. Records are not held for long, but they should
180238
// always give consistent results when asked.
@@ -285,7 +343,7 @@ func (nv NodeVitality) IsLive(usage VitalityUsage) bool {
285343
return nv.isAliveAndConnected()
286344
}
287345
case NetworkMap:
288-
return nv.connected
346+
return nv.nodeConnectionStatus.IsConnected()
289347
case LossOfQuorum:
290348
return nv.isAlive()
291349
case ReplicaGCQueue:
@@ -315,7 +373,7 @@ func (nv NodeVitality) isAvailableNotDraining() bool {
315373
}
316374

317375
func (nv NodeVitality) isAliveAndConnected() bool {
318-
return nv.isAvailableNotDraining() && nv.connected
376+
return nv.isAvailableNotDraining() && nv.nodeConnectionStatus.IsConnected()
319377
}
320378

321379
// isAliveEpoch is used for epoch leases. It is similar to isAlive, but doesn't
@@ -535,7 +593,7 @@ func (l Liveness) CreateNodeVitality(
535593
now hlc.Timestamp,
536594
descUpdateTime hlc.Timestamp,
537595
descUnavailableTime hlc.Timestamp,
538-
connected bool,
596+
connectionStatus *NodeConnectionStatus,
539597
timeUntilNodeDead time.Duration,
540598
timeAfterNodeSuspect time.Duration,
541599
) NodeVitality {
@@ -546,7 +604,7 @@ func (l Liveness) CreateNodeVitality(
546604
nodeID: l.NodeID,
547605
draining: l.Draining,
548606
membership: l.Membership,
549-
connected: connected,
607+
nodeConnectionStatus: connectionStatus,
550608
now: now,
551609
descUpdateTime: descUpdateTime,
552610
descUnavailableTime: descUnavailableTime,

pkg/kv/kvserver/liveness/livenesspb/liveness_test_helper.go

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ import (
1010
"time"
1111

1212
"github.com/cockroachdb/cockroach/pkg/roachpb"
13+
"github.com/cockroachdb/cockroach/pkg/rpc"
1314
"github.com/cockroachdb/cockroach/pkg/util/hlc"
1415
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
16+
"github.com/cockroachdb/errors"
1517
)
1618

1719
// testNodeVitalityEntry is here to minimize the impact on tests of changing to
@@ -49,10 +51,13 @@ func TestCreateNodeVitality(ids ...roachpb.NodeID) TestNodeVitality {
4951

5052
func (e testNodeVitalityEntry) convert() NodeVitality {
5153
now := e.clock.Now()
54+
ncs := NewNodeConnectionStatus(0, nil)
5255
if e.Alive {
53-
return e.Liveness.CreateNodeVitality(now, now, hlc.Timestamp{}, true, time.Second, time.Second)
56+
ncs.SetIsConnected(true)
57+
return e.Liveness.CreateNodeVitality(now, now, hlc.Timestamp{}, ncs, time.Second, time.Second)
5458
} else {
55-
return e.Liveness.CreateNodeVitality(now, now.AddDuration(-time.Hour), hlc.Timestamp{}, false, time.Second, time.Second)
59+
ncs.SetIsConnected(false)
60+
return e.Liveness.CreateNodeVitality(now, now.AddDuration(-time.Hour), hlc.Timestamp{}, ncs, time.Second, time.Second)
5661
}
5762
}
5863

@@ -159,21 +164,49 @@ func (tnv TestNodeVitality) RestartNode(id roachpb.NodeID) {
159164
// FakeNodeVitality creates a node vitality record that is either dead or alive
160165
// by all accounts.
161166
func FakeNodeVitality(alive bool) NodeVitality {
167+
ncs := NewNodeConnectionStatus(1, nil)
162168
if alive {
169+
ncs.SetIsConnected(true)
163170
return NodeVitality{
164171
nodeID: 1,
165-
connected: true,
172+
nodeConnectionStatus: ncs,
166173
now: hlc.Timestamp{}.AddDuration(time.Nanosecond),
167174
timeUntilNodeDead: time.Second,
168175
timeAfterNodeSuspect: time.Second,
169176
livenessExpiration: hlc.Timestamp{}.AddDuration(2 * time.Nanosecond),
170177
livenessEpoch: 1,
171178
}
172179
} else {
180+
ncs.SetIsConnected(false)
173181
return NodeVitality{
174-
nodeID: 1,
175-
connected: false,
176-
livenessEpoch: 1,
182+
nodeID: 1,
183+
nodeConnectionStatus: ncs,
184+
livenessEpoch: 1,
177185
}
178186
}
179187
}
188+
189+
// MockNodeConnectionHealth implements the NodeConnectionHealth interface for
190+
// testing. It just has one field to simulate the connection state.
191+
type MockNodeConnectionHealth struct {
192+
connected bool
193+
}
194+
195+
// NewMockNodeConnectionHealth creates a new MockNodeConnectionHealth with the
196+
// given connection state.
197+
func NewMockNodeConnectionHealth(connected bool) *MockNodeConnectionHealth {
198+
return &MockNodeConnectionHealth{connected: connected}
199+
}
200+
201+
// SetConnected sets the connection state of the mock node connection health.
202+
func (m *MockNodeConnectionHealth) SetConnected(connected bool) {
203+
m.connected = connected
204+
}
205+
206+
// ConnHealth implements the NodeDialer interface.
207+
func (m *MockNodeConnectionHealth) ConnHealth(_ roachpb.NodeID, _ rpc.ConnectionClass) error {
208+
if m.connected {
209+
return nil
210+
}
211+
return errors.Errorf("not connected")
212+
}

0 commit comments

Comments
 (0)