Skip to content

Commit 6fcec70

Browse files
craig[bot]kev-caowenyihu6
committed
150157: roachtest: validate number of tables in database for fingerprinting r=msbutler a=kev-cao Continuing to get failures with missing tables in the backup fixtures, this commit adds a stricter validation to our backup fixtures to ensure that the number of tables are being fingerprinted. This should prevent invalid fingerprints and fixtures from being created, which should reduce our test failures on restore and isolate the failures to the real culprit, the fixture tests. Informs: #149671 Fixes: #149772, #149773, #149769 Release note: None 150192: asim: follow up for #149904 r=tbg a=wenyihu6 **asim: use cluster setting LBRebalancingMode directly** Previously, LBRebalancingMode existed both as a simulation setting and a cluster setting, which was confusing especially now that we're plumbing the cluster setting through. This commit removes the simulation specific setting and makes the simulator use the cluster setting directly instead. Note that default behavior is leases mode and replicas=2 remains unchanged for both. Epic: none Release note: none --- **mmaprototypehelpers: skip mma coordination when mma is disabled** Previously, non-mma components registered external changes with mma via allocator sync even when mma was disabled. This led to logs like `did not register external changes: due to range does not exist in cluster state`. Without mma being enabled, mma can’t register external changes because it never learns about the ranges via processStoreLeaseholderMsg in the first place, and this external change would get dropped. This commit changes allocator sync to skip coordinating with the mma allocator when mma is disabled. Note that we changes still go via allocator sync even when mma is disabled to update the local store pool correctly. Epic: none Release note: none --- **asim: add more data driven tests** This commit adds more data-driven tests. Output looks the same as 49d07ce. Note that long-running test files below are skipped in normal CI using the skip_under_ci datadriven command but will still run under the extra stress pipeline. ``` example_liveness example_fulldisk mma_high_cpu_25nodes mma_skewed_cpu_skewed_write_more_ranges mma_skewed_cpu_skewed_write_all_enabled mma_one_voter_skewed_cpu_skewed_write mma_skewed_cpu_skewed_write mma_high_write_uniform_cpu_all_enabled mma_high_write_uniform_cpu example_skewed_cpu_even_ranges_mma_and_queues mma_constraint_satisfaction1 mma_high_cpu example_splitting example_rebalancing example_skewed_cpu_even_ranges_sma_and_queues example_skewed_cpu_even_ranges_mma example_skewed_cpu_even_ranges_sma example_io_overload mma_constraint_satisfaction_old_alloc example_zone_config ``` Epic: none Release note: none Co-authored-by: Kevin Cao <[email protected]> Co-authored-by: wenyihu6 <[email protected]>
3 parents 8f667c4 + 8157c5d + 6ac8bb5 commit 6fcec70

36 files changed

+2948
-140
lines changed

pkg/cmd/roachtest/tests/backup_fixtures.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ import (
4949
// and rely on the managed identity to authenticate with Azure Blob Storage.
5050
const azureCredentialsFilePath = "/home/ubuntu/azure-credentials.yaml"
5151

52+
// Maps a fixture database name to the expected number of tables in the
53+
// database, useful for verifying that the fingerprint of the fixture is as
54+
// expected.
55+
var expectedNumTables = map[string]int{
56+
"tpcc": 9,
57+
}
58+
5259
type BackupFixture interface {
5360
Kind() string
5461
// The database that is backed up.
@@ -468,6 +475,7 @@ func fingerprintDatabase(
468475
t.L().Printf("no tables found in database %s", dbName)
469476
return nil
470477
}
478+
require.Len(t, tables, expectedNumTables[dbName], "unexpected number of tables in database %s", dbName)
471479
t.L().Printf("fingerprinting %d tables in database %s", len(tables), dbName)
472480

473481
fingerprints := make(map[string]string)
@@ -493,6 +501,10 @@ func fingerprintDatabase(
493501
"fingerprinted %d tables in %s in %s",
494502
len(tables), dbName, timeutil.Since(start),
495503
)
504+
require.Len(
505+
t, fingerprints, expectedNumTables[dbName],
506+
"unexpected number of fingerprints for database %s", dbName,
507+
)
496508
return fingerprints
497509
}
498510

@@ -513,6 +525,7 @@ func getDatabaseTables(t test.Test, sql *sqlutils.SQLRunner, db string) []string
513525
}
514526
tables = append(tables, fmt.Sprintf(`%s.%s.%s`, db, schemaName, tableName))
515527
}
528+
require.NoError(t, rows.Err(), "error iterating over tables in database %s", db)
516529
return tables
517530
}
518531

pkg/kv/kvserver/allocator/mmaprototypehelpers/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ go_library(
1010
visibility = ["//visibility:public"],
1111
deps = [
1212
"//pkg/kv/kvpb",
13+
"//pkg/kv/kvserver",
1314
"//pkg/kv/kvserver/allocator",
1415
"//pkg/kv/kvserver/allocator/mmaprototype",
1516
"//pkg/kv/kvserver/allocator/storepool",
1617
"//pkg/roachpb",
18+
"//pkg/settings/cluster",
1719
"//pkg/util/log",
1820
"//pkg/util/syncutil",
1921
"//pkg/util/timeutil",

pkg/kv/kvserver/allocator/mmaprototypehelpers/kvserver_mma_integration.go

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ import (
1010
"fmt"
1111

1212
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
13+
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
1314
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator"
1415
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/mmaprototype"
1516
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool"
1617
"github.com/cockroachdb/cockroach/pkg/roachpb"
18+
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
1719
"github.com/cockroachdb/cockroach/pkg/util/log"
1820
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
1921
)
@@ -62,17 +64,21 @@ func (s Author) External() bool {
6264
type AllocatorSync struct {
6365
sp *storepool.StorePool
6466
mmAllocator mmaprototype.Allocator
67+
st *cluster.Settings
6568
mu struct {
6669
syncutil.Mutex
6770
changeSeqGen SyncChangeID
6871
trackedChanges map[SyncChangeID]trackedAllocatorChange
6972
}
7073
}
7174

72-
func NewAllocatorSync(sp *storepool.StorePool, mmAllocator mmaprototype.Allocator) *AllocatorSync {
75+
func NewAllocatorSync(
76+
sp *storepool.StorePool, mmAllocator mmaprototype.Allocator, st *cluster.Settings,
77+
) *AllocatorSync {
7378
as := &AllocatorSync{
7479
sp: sp,
7580
mmAllocator: mmAllocator,
81+
st: st,
7682
}
7783
as.mu.trackedChanges = make(map[SyncChangeID]trackedAllocatorChange)
7884
return as
@@ -122,9 +128,12 @@ func (as *AllocatorSync) NonMMAPreTransferLease(
122128
)
123129
log.Infof(ctx, "registering external lease transfer change: usage=%v changes=%v",
124130
usage, replicaChanges)
125-
changeIDs := as.mmAllocator.RegisterExternalChanges(replicaChanges[:])
126-
if changeIDs == nil {
127-
log.Info(ctx, "mma did not track lease transfer, skipping")
131+
var changeIDs []mmaprototype.ChangeID
132+
if kvserver.LoadBasedRebalancingMode.Get(&as.st.SV) == kvserver.LBRebalancingMultiMetric {
133+
changeIDs = as.mmAllocator.RegisterExternalChanges(replicaChanges[:])
134+
if changeIDs == nil {
135+
log.Info(ctx, "mma did not track lease transfer, skipping")
136+
}
128137
}
129138
trackedChange := trackedAllocatorChange{
130139
typ: AllocatorChangeTypeLeaseTransfer,
@@ -210,9 +219,12 @@ func (as *AllocatorSync) NonMMAPreChangeReplicas(
210219

211220
log.Infof(ctx, "registering external replica change: chgs=%v usage=%v changes=%v",
212221
changes, usage, replicaChanges)
213-
changeIDs := as.mmAllocator.RegisterExternalChanges(replicaChanges)
214-
if changeIDs == nil {
215-
log.Info(ctx, "cluster does not have a range for the external replica change, skipping")
222+
var changeIDs []mmaprototype.ChangeID
223+
if kvserver.LoadBasedRebalancingMode.Get(&as.st.SV) == kvserver.LBRebalancingMultiMetric {
224+
changeIDs = as.mmAllocator.RegisterExternalChanges(replicaChanges)
225+
if changeIDs == nil {
226+
log.Info(ctx, "cluster does not have a range for the external replica change, skipping")
227+
}
216228
}
217229
trackedChange := trackedAllocatorChange{
218230
typ: AllocatorChangeTypeChangeReplicas,
@@ -335,12 +347,14 @@ func (as *AllocatorSync) PostApply(ctx context.Context, syncChangeID SyncChangeI
335347
}
336348
delete(as.mu.trackedChanges, syncChangeID)
337349
}()
338-
if changeIDs := tracked.changeIDs; changeIDs != nil {
339-
log.Infof(ctx, "PostApply: tracked=%v change_ids=%v success: %v", tracked, changeIDs, success)
340-
as.updateMetrics(success, tracked.typ, tracked.author)
341-
as.mmAllocator.AdjustPendingChangesDisposition(changeIDs, success)
342-
} else {
343-
log.Infof(ctx, "PostApply: tracked=%v no change_ids success: %v", tracked, success)
350+
if kvserver.LoadBasedRebalancingMode.Get(&as.st.SV) == kvserver.LBRebalancingMultiMetric {
351+
if changeIDs := tracked.changeIDs; changeIDs != nil {
352+
log.Infof(ctx, "PostApply: tracked=%v change_ids=%v success: %v", tracked, changeIDs, success)
353+
as.updateMetrics(success, tracked.typ, tracked.author)
354+
as.mmAllocator.AdjustPendingChangesDisposition(changeIDs, success)
355+
} else {
356+
log.Infof(ctx, "PostApply: tracked=%v no change_ids success: %v", tracked, success)
357+
}
344358
}
345359
as.updateMetrics(success, tracked.typ, tracked.author)
346360
if !success {

pkg/kv/kvserver/asim/config/settings.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ const (
2727
defaultSplitQPSThreshold = 2500
2828
defaultSplitStatRetention = 10 * time.Minute
2929
defaultSeed = 42
30-
defaultLBRebalancingMode = 2 // Leases and replicas.
3130
defaultLBRebalancingInterval = time.Minute
3231
defaultLBRebalanceQPSThreshold = 0.1
3332
defaultLBRebalancingObjective = 0 // QPS
@@ -96,9 +95,6 @@ type SimulationSettings struct {
9695
// SplitStatRetention is the duration which recorded load will be retained
9796
// and factored into load based splitting decisions.
9897
SplitStatRetention time.Duration
99-
// LBRebalancingMode controls if and when we do store-level rebalancing
100-
// based on load. It maps to kvserver.LBRebalancingMode.
101-
LBRebalancingMode int64
10298
// LBRebalancingObjective is the load objective to balance.
10399
LBRebalancingObjective int64
104100
// LBRebalancingInterval controls how often the store rebalancer will
@@ -136,7 +132,6 @@ func DefaultSimulationSettings() *SimulationSettings {
136132
StateExchangeDelay: defaultStateExchangeDelay,
137133
SplitQPSThreshold: defaultSplitQPSThreshold,
138134
SplitStatRetention: defaultSplitStatRetention,
139-
LBRebalancingMode: defaultLBRebalancingMode,
140135
LBRebalancingObjective: defaultLBRebalancingObjective,
141136
LBRebalancingInterval: defaultLBRebalancingInterval,
142137
ReplicateQueueEnabled: true,

pkg/kv/kvserver/asim/event/mutation_event.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,9 @@ type SetNodeLocalityEvent struct {
5454
// SetSimulationSettingsEvent represents a mutation event responsible for
5555
// changing a simulation setting during the simulation.
5656
type SetSimulationSettingsEvent struct {
57-
Key string
58-
Value interface{}
57+
IsClusterSetting bool
58+
Key string
59+
Value interface{}
5960
}
6061

6162
var _ Event = &SetSpanConfigEvent{}
@@ -140,7 +141,11 @@ func (sne SetNodeLocalityEvent) String() string {
140141

141142
func (se SetSimulationSettingsEvent) Func() EventFunc {
142143
return MutationFunc(func(ctx context.Context, s state.State) {
143-
s.SetSimulationSettings(se.Key, se.Value)
144+
if se.IsClusterSetting {
145+
s.SetClusterSetting(se.Key, se.Value)
146+
} else {
147+
s.SetSimulationSettings(se.Key, se.Value)
148+
}
144149
})
145150
}
146151

pkg/kv/kvserver/asim/mmaintegration/mma_store_rebalancer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ func (msr *MMAStoreRebalancer) Tick(ctx context.Context, tick time.Time, s state
9797
return
9898
}
9999

100-
if msr.settings.LBRebalancingMode != int64(kvserver.LBRebalancingMultiMetric) {
100+
if kvserver.LoadBasedRebalancingMode.Get(&msr.settings.ST.SV) != kvserver.LBRebalancingMultiMetric {
101101
// When the store rebalancer isn't set to use the multi-metric mode, the
102102
// legacy store rebalancer is used.
103103
return

pkg/kv/kvserver/asim/state/impl.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,9 +430,9 @@ func (s *state) AddNode() Node {
430430
nodeID: nodeID,
431431
desc: roachpb.NodeDescriptor{NodeID: roachpb.NodeID(nodeID)},
432432
stores: []StoreID{},
433-
storepool: sp,
434433
mmAllocator: mmAllocator,
435-
as: mmaprototypehelpers.NewAllocatorSync(sp, mmAllocator),
434+
storepool: sp,
435+
as: mmaprototypehelpers.NewAllocatorSync(sp, mmAllocator, s.settings.ST),
436436
}
437437
s.nodes[nodeID] = node
438438
s.SetNodeLiveness(nodeID, livenesspb.NodeLivenessStatus_LIVE)
@@ -1393,6 +1393,15 @@ func (s *state) RegisterConfigChangeListener(listener ConfigChangeListener) {
13931393
s.configChangeListeners = append(s.configChangeListeners, listener)
13941394
}
13951395

1396+
func (s *state) SetClusterSetting(Key string, Value interface{}) {
1397+
switch Key {
1398+
case "LBRebalancingMode":
1399+
kvserver.LoadBasedRebalancingMode.Override(context.Background(), &s.settings.ST.SV, kvserver.LBRebalancingMode(Value.(int64)))
1400+
default:
1401+
panic("other cluster settings not supported")
1402+
}
1403+
}
1404+
13961405
// SetSimulationSettings sets the simulation setting for the given key to the
13971406
// given value.
13981407
func (s *state) SetSimulationSettings(Key string, Value interface{}) {

pkg/kv/kvserver/asim/state/state.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ type State interface {
202202
RegisterConfigChangeListener(ConfigChangeListener)
203203
// SetSimulationSettings sets the simulation settings for the state.
204204
SetSimulationSettings(Key string, Value interface{})
205+
// SetClusterSetting sets the cluster setting for the state.
206+
SetClusterSetting(Key string, Value interface{})
205207
// NodeCapacity returns the capacity of the node with ID NodeID.
206208
NodeCapacity(NodeID) roachpb.NodeCapacity
207209
// SetNodeCPURateCapacity sets the CPU rate capacity for the node with ID

pkg/kv/kvserver/asim/storerebalancer/store_rebalancer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ func (src *storeRebalancerControl) phasePrologue(
190190
s, src.storeID,
191191
kvserver.LBRebalancingObjective(src.settings.LBRebalancingObjective).ToDimension(),
192192
),
193-
kvserver.LBRebalancingMode(src.settings.LBRebalancingMode),
193+
kvserver.LoadBasedRebalancingMode.Get(&src.settings.ST.SV),
194194
)
195195

196196
if !src.sr.ShouldRebalanceStore(ctx, rctx) {

pkg/kv/kvserver/asim/tests/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ go_test(
3838
data = glob(["testdata/**"]),
3939
embed = [":tests"],
4040
deps = [
41+
"//pkg/kv/kvserver",
4142
"//pkg/kv/kvserver/allocator/allocatorimpl",
4243
"//pkg/kv/kvserver/asim/assertion",
4344
"//pkg/kv/kvserver/asim/config",

0 commit comments

Comments
 (0)