Skip to content

Commit bf78585

Browse files
craig[bot]RaduBerindewenyihu6shubhamdhama
committed
152915: ptcache: fix error logging logic r=RaduBerinde a=RaduBerinde Fix bug introduced in #144335 which accidentally suppresses this log. Epic: none Release note: None 153033: kvserver: return error properly for GetAggregatedStoreStats r=tbg a=wenyihu6 Epic: none Release note: none --- **kvserver: pass replication target rq.TransferLease** Previously, rq.TransferLease was updated to take in replication target (node ID and store ID) instead of just a store ID, but the log line formatting wasn’t updated accordingly. This commit fixes the log formatting. --- **kvserver: return error properly for GetAggregatedStoreStats** Previously, GetAggregatedStoreStats would panic on error for store.Capacity instead of returning it. This commit fixes the issue by properly returning the error back to store.Descriptor. --- **asim: correctly output rangeID** Previously, the log labeled a value as a range but printed the storeID. This commit fixes it to correctly print the rangeID. 153139: testcluster: fix TestDRPCEnabledRandomly for cluster with multiple nodes r=cthumuluru-crdb a=shubhamdhama `` testcluster: fix TestDRPCEnabledRandomly for cluster with multiple nodes `` This change fixes the behavior where it was deciding the DRPC enablement for each node independently for TestDRPCEnabledRandomly option, leading to a cluster startup with some nodes with DRPC enabled, and others not. Now the DRPC option is resolved once at the cluster level and applied consistently to all nodes, ensuring uniform DRPC configuration across the entire test cluster. `` server: fix DRPC option precedence for test-specific settings `` Previously, when a test explicitly set TestDRPCDisabled but a global DRPC override was configured, the global override would incorrectly take precedence. This meant tests that explicitly disabled DRPC could still end up running with DRPC enabled. The fix also adds better logging to indicate when a global override is being applied, making test behavior more transparent. Epic: none Release notes: none Co-authored-by: Radu Berinde <[email protected]> Co-authored-by: wenyihu6 <[email protected]> Co-authored-by: Shubham Dhama <[email protected]>
4 parents ab4a9cb + 9ca8d7d + b566ec1 + 4cab865 commit bf78585

File tree

11 files changed

+108
-56
lines changed

11 files changed

+108
-56
lines changed

pkg/base/test_server_args.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,12 @@ type SlimTestServerConfig struct {
235235
type DefaultTestDRPCOption uint8
236236

237237
const (
238+
// TestDRPCUnset represents an uninitialized or invalid DRPC option.
239+
TestDRPCUnset DefaultTestDRPCOption = iota
240+
238241
// TestDRPCDisabled disables DRPC; all inter-node connectivity will use gRPC
239242
// only.
240-
TestDRPCDisabled DefaultTestDRPCOption = iota
243+
TestDRPCDisabled
241244

242245
// TestDRPCEnabled enables DRPC. Some services may still use gRPC if they
243246
// have not yet migrated to DRPC.

pkg/kv/kvserver/asim/state/impl.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1089,7 +1089,7 @@ func (s *state) applyLoad(rng *rng, le workload.LoadEvent) {
10891089
func (s *state) RangeUsageInfo(rangeID RangeID, storeID StoreID) allocator.RangeUsageInfo {
10901090
r, ok := s.Range(rangeID)
10911091
if !ok {
1092-
panic(fmt.Sprintf("no leaseholder store found for range %d", storeID))
1092+
panic(fmt.Sprintf("no leaseholder store found for range %d", rangeID))
10931093
}
10941094

10951095
if _, ok = r.Replica(storeID); !ok {

pkg/kv/kvserver/load/node_capacity_provider.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ type StoresStatsAggregator interface {
2424
// GetAggregatedStoreStats returns the total cpu usage across all stores and
2525
// the count of stores. If useCached is true, it uses the cached store
2626
// descriptor instead of computing new ones. Implemented by Stores.
27-
GetAggregatedStoreStats(useCached bool) (aggregatedCPUUsage int64, totalStoreCount int32)
27+
GetAggregatedStoreStats(useCached bool) (aggregatedCPUUsage int64, totalStoreCount int32, err error)
2828
}
2929

3030
// NodeCapacityProvider reports node-level cpu usage and capacity by sampling
@@ -82,8 +82,11 @@ func (n *NodeCapacityProvider) Run(ctx context.Context) {
8282
// capacity and aggregated store-level cpu usage. If useCached is true, it will
8383
// use cached store descriptors to aggregate the sum of store-level cpu
8484
// capacity.
85-
func (n *NodeCapacityProvider) GetNodeCapacity(useCached bool) roachpb.NodeCapacity {
86-
storesCPURate, numStores := n.stores.GetAggregatedStoreStats(useCached)
85+
func (n *NodeCapacityProvider) GetNodeCapacity(useCached bool) (roachpb.NodeCapacity, error) {
86+
storesCPURate, numStores, err := n.stores.GetAggregatedStoreStats(useCached)
87+
if err != nil {
88+
return roachpb.NodeCapacity{}, err
89+
}
8790
// TODO(wenyihu6): may be unexpected to caller that useCached only applies to
8891
// the stores stats but not runtime load monitor. We can change
8992
// runtimeLoadMonitor to also fetch updated stats.
@@ -95,7 +98,7 @@ func (n *NodeCapacityProvider) GetNodeCapacity(useCached bool) roachpb.NodeCapac
9598
NumStores: numStores,
9699
NodeCPURateCapacity: cpuCapacityNanoPerSec,
97100
NodeCPURateUsage: cpuUsageNanoPerSec,
98-
}
101+
}, nil
99102
}
100103

101104
// runtimeLoadMonitor polls cpu usage and capacity stats of the node

pkg/kv/kvserver/load/node_capacity_provider_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ type mockStoresStatsAggregator struct {
2525

2626
func (m *mockStoresStatsAggregator) GetAggregatedStoreStats(
2727
_ bool,
28-
) (totalCPUUsage int64, totalStoreCount int32) {
29-
return m.cpuUsage, m.storeCount
28+
) (totalCPUUsage int64, totalStoreCount int32, _ error) {
29+
return m.cpuUsage, m.storeCount, nil
3030
}
3131

3232
// TestNodeCapacityProvider tests the basic functionality of the
@@ -51,7 +51,8 @@ func TestNodeCapacityProvider(t *testing.T) {
5151

5252
// Provider should have valid stats.
5353
testutils.SucceedsSoon(t, func() error {
54-
nc := provider.GetNodeCapacity(false)
54+
nc, err := provider.GetNodeCapacity(false)
55+
require.NoError(t, err)
5556
if nc.NodeCPURateUsage == 0 || nc.NodeCPURateCapacity == 0 || nc.StoresCPURate == 0 {
5657
return errors.Newf(
5758
"CPU usage or capacity is 0: node cpu rate usage %v, node cpu rate capacity %v, stores cpu rate %v",
@@ -62,7 +63,8 @@ func TestNodeCapacityProvider(t *testing.T) {
6263

6364
cancel()
6465
// GetNodeCapacity should still return valid stats after cancellation.
65-
nc := provider.GetNodeCapacity(false)
66+
nc, err := provider.GetNodeCapacity(false)
67+
require.NoError(t, err)
6668
require.Greater(t, nc.NodeCPURateCapacity, int64(0))
6769
require.Greater(t, nc.NodeCPURateUsage, int64(0))
6870
}

pkg/kv/kvserver/protectedts/ptcache/cache.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,12 @@ func (c *Cache) periodicallyRefreshProtectedtsCache(ctx context.Context) {
188188
return
189189
}
190190
res := future.WaitForResult(ctx)
191-
if res.Err != nil && ctx.Err() != nil {
192-
log.Dev.Errorf(ctx, "failed to refresh protected timestamps: %v", res.Err)
191+
if res.Err != nil {
192+
// We expect an error if the context was cancelled, we don't want to log
193+
// in that case.
194+
if ctx.Err() == nil {
195+
log.Dev.Errorf(ctx, "failed to refresh protected timestamps: %v", res.Err)
196+
}
193197
}
194198
timer.Reset(protectedts.PollInterval.Get(&c.settings.SV))
195199
lastReset = timeutil.Now()

pkg/kv/kvserver/replicate_queue.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1125,7 +1125,7 @@ func (rq *replicateQueue) TransferLease(
11251125
rangeUsageInfo allocator.RangeUsageInfo,
11261126
) error {
11271127
rq.metrics.TransferLeaseCount.Inc(1)
1128-
log.KvDistribution.Infof(ctx, "transferring lease to s%d", target)
1128+
log.KvDistribution.Infof(ctx, "transferring lease to %v", target)
11291129
// Inform allocator sync that the change has been applied which applies
11301130
// changes to store pool and inform mma.
11311131
changeID := rq.as.NonMMAPreTransferLease(
@@ -1139,7 +1139,7 @@ func (rq *replicateQueue) TransferLease(
11391139
rq.as.PostApply(changeID, err == nil /*success*/)
11401140

11411141
if err != nil {
1142-
return errors.Wrapf(err, "%s: unable to transfer lease to s%d", rlm, target)
1142+
return errors.Wrapf(err, "%s: unable to transfer lease to %v", rlm, target)
11431143
}
11441144
return nil
11451145
}

pkg/kv/kvserver/store.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3295,14 +3295,19 @@ func (s *Store) Descriptor(ctx context.Context, useCached bool) (*roachpb.StoreD
32953295
return nil, err
32963296
}
32973297

3298+
nc, err := s.nodeCapacityProvider.GetNodeCapacity(useCached)
3299+
if err != nil {
3300+
return nil, err
3301+
}
3302+
32983303
// Initialize the store descriptor.
32993304
return &roachpb.StoreDescriptor{
33003305
StoreID: s.Ident.StoreID,
33013306
Attrs: s.Attrs(),
33023307
Node: *s.nodeDesc,
33033308
Capacity: capacity,
33043309
Properties: s.Properties(),
3305-
NodeCapacity: s.nodeCapacityProvider.GetNodeCapacity(useCached),
3310+
NodeCapacity: nc,
33063311
}, nil
33073312
}
33083313

pkg/kv/kvserver/stores.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -338,15 +338,19 @@ func (ls *Stores) GetStoreMetricRegistry(storeID roachpb.StoreID) *metric.Regist
338338

339339
// GetAggregatedStoreStats returns the aggregated cpu usage across all stores and
340340
// the count of stores.
341-
func (ls *Stores) GetAggregatedStoreStats(useCached bool) (storesCPURate int64, numStores int32) {
342-
_ = ls.VisitStores(func(s *Store) error {
341+
func (ls *Stores) GetAggregatedStoreStats(
342+
useCached bool,
343+
) (storesCPURate int64, numStores int32, _ error) {
344+
if err := ls.VisitStores(func(s *Store) error {
343345
c, err := s.Capacity(context.Background(), useCached)
344346
if err != nil {
345-
panic(err)
347+
return err
346348
}
347349
storesCPURate += int64(c.CPUPerSecond)
348350
numStores++
349351
return nil
350-
})
351-
return storesCPURate, numStores
352+
}); err != nil {
353+
return 0, 0, err
354+
}
355+
return storesCPURate, numStores, nil
352356
}

pkg/server/testserver.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,10 @@ func makeTestConfigFromParams(params base.TestServerArgs) Config {
333333
cfg.TestingKnobs.AdmissionControlOptions = &admission.Options{}
334334
}
335335

336+
if params.DefaultDRPCOption == base.TestDRPCEnabled {
337+
rpc.ExperimentalDRPCEnabled.Override(context.Background(), &st.SV, true)
338+
}
339+
336340
return cfg
337341
}
338342

pkg/testutils/serverutils/test_server_shim.go

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,8 @@ import (
2424
"github.com/cockroachdb/cockroach/pkg/kv"
2525
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilitiespb"
2626
"github.com/cockroachdb/cockroach/pkg/roachpb"
27-
"github.com/cockroachdb/cockroach/pkg/rpc"
2827
"github.com/cockroachdb/cockroach/pkg/security/securitytest"
2928
"github.com/cockroachdb/cockroach/pkg/security/username"
30-
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
3129
"github.com/cockroachdb/cockroach/pkg/testutils/pgurlutils"
3230
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
3331
"github.com/cockroachdb/cockroach/pkg/util/envutil"
@@ -274,11 +272,9 @@ func StartServerOnlyE(t TestLogger, params base.TestServerArgs) (TestServerInter
274272
ctx := context.Background()
275273
allowAdditionalTenants := params.DefaultTestTenant.AllowAdditionalTenants()
276274

277-
// Update the flags with the actual decision as to whether we should
278-
// start the service for a default test tenant.
275+
// Update the flags with the actual decisions for test configuration.
279276
params.DefaultTestTenant = ShouldStartDefaultTestTenant(t, params.DefaultTestTenant)
280-
281-
TryEnableDRPCSetting(ctx, t, &params)
277+
params.DefaultDRPCOption = ShouldEnableDRPC(ctx, t, params.DefaultDRPCOption)
282278

283279
s, err := NewServer(params)
284280
if err != nil {
@@ -541,13 +537,16 @@ func WaitForTenantCapabilities(
541537
}
542538
}
543539

544-
// TryEnableDRPCSetting determines whether to enable the DRPC cluster setting
545-
// based on the `TestServerArgs.DefaultDRPCOption` and updates the
546-
// `TestServerArgs.Settings` based on that.
547-
func TryEnableDRPCSetting(ctx context.Context, t TestLogger, args *base.TestServerArgs) {
548-
option := args.DefaultDRPCOption
549-
if option == base.TestDRPCDisabled && globalDefaultDRPCOptionOverride.isSet {
540+
// ShouldEnableDRPC determines the final DRPC option based on the input
541+
// option and any global overrides, resolving random choices to a concrete
542+
// enabled/disabled state.
543+
func ShouldEnableDRPC(
544+
ctx context.Context, t TestLogger, option base.DefaultTestDRPCOption,
545+
) base.DefaultTestDRPCOption {
546+
var logSuffix string
547+
if option == base.TestDRPCUnset && globalDefaultDRPCOptionOverride.isSet {
550548
option = globalDefaultDRPCOptionOverride.value
549+
logSuffix = " (override by TestingGlobalDRPCOption)"
551550
}
552551
enableDRPC := false
553552
switch option {
@@ -557,12 +556,11 @@ func TryEnableDRPCSetting(ctx context.Context, t TestLogger, args *base.TestServ
557556
rng, _ := randutil.NewTestRand()
558557
enableDRPC = rng.Intn(2) == 0
559558
}
560-
if !enableDRPC {
561-
return
562-
}
563559

564-
if args.Settings == nil {
565-
args.Settings = cluster.MakeClusterSettings()
560+
if enableDRPC {
561+
t.Log("DRPC is enabled" + logSuffix)
562+
return base.TestDRPCEnabled
566563
}
567-
rpc.ExperimentalDRPCEnabled.Override(ctx, &args.Settings.SV, true)
564+
565+
return base.TestDRPCDisabled
568566
}

0 commit comments

Comments
 (0)