Skip to content

Commit e5415f7

Browse files
craig[bot]tbgwenyihu6
committed
Merge #154513
154513: asim: print generated load, trace StoreRebalancer r=wenyihu6 a=tbg See individual commits. Epic: CRDB-49117 Co-authored-by: Tobias Grieger <[email protected]> Co-authored-by: wenyihu6 <[email protected]>
2 parents 3135118 + ecf4a86 commit e5415f7

15 files changed

+74
-13
lines changed

pkg/kv/kvserver/asim/asim.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,13 +98,18 @@ func NewSimulator(
9898
changer := state.NewReplicaChanger()
9999
controllers := make(map[state.StoreID]op.Controller)
100100

101+
var onRecording func(storeID state.StoreID, atDuration time.Duration, rec tracingpb.Recording)
102+
if fn := settings.OnRecording; fn != nil {
103+
onRecording = func(storeID state.StoreID, atDuration time.Duration, rec tracingpb.Recording) {
104+
fn(int64(storeID), atDuration, rec)
105+
}
106+
}
107+
101108
s := &Simulator{
102109
AmbientContext: log.MakeTestingAmbientCtxWithNewTracer(),
103-
onRecording: func(storeID state.StoreID, atDuration time.Duration, rec tracingpb.Recording) {
104-
if fn := settings.OnRecording; fn != nil {
105-
fn(int64(storeID), atDuration, rec)
106-
}
107-
},
110+
// onRecording is intentionally nil if settings.OnRecording is nil, to
111+
// short-circuit trace creation overhead in that case.
112+
onRecording: onRecording,
108113
curr: settings.StartTime,
109114
end: settings.StartTime.Add(duration),
110115
interval: settings.TickInterval,
@@ -404,7 +409,9 @@ func (s *Simulator) tickStoreRebalancers(ctx context.Context, tick time.Time, st
404409
stores := s.state.Stores()
405410
s.shuffler(len(stores), func(i, j int) { stores[i], stores[j] = stores[j], stores[i] })
406411
for _, store := range stores {
407-
s.srs[store.StoreID()].Tick(ctx, tick, state)
412+
s.doAndMaybeTrace(ctx, store.StoreID(), tick, "StoreRebalancer", func(ctx context.Context) {
413+
s.srs[store.StoreID()].Tick(ctx, tick, state)
414+
})
408415
}
409416
}
410417

pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -244,15 +244,17 @@ func TestDataDriven(t *testing.T) {
244244
// CPU consumptions. This isn't exact because it doesn't account for
245245
// replication, but it's close enough.
246246
// NB: writes also consume requestCPUPerAccess.
247-
approxVCPUs := (rate * (float64(requestCPUPerAccess) + float64(raftCPUPerAccess)*(1.0-rwRatio))) / 1e9
247+
accessVCPUs := rate * float64(requestCPUPerAccess) / 1e9
248+
beforeReplicationRaftVCPUs := rate * (1 - rwRatio) * float64(raftCPUPerAccess) / 1e9
249+
approxVCPUs := accessVCPUs + beforeReplicationRaftVCPUs
248250
// Ditto for writes. Here too we don't account for replication. Note
249251
// that at least under uniform writes, real clusters can have a write
250252
// amp that easily surpasses 20, so writing at 40mb/s to a small set
251253
// of stores would often constitute an issue in production.
252-
approxWriteBytes := float64(maxBlock+minBlock) * rate * (1.0 - rwRatio) / 2
254+
beforeReplicationWriteBytes := float64(maxBlock+minBlock) * rate * (1.0 - rwRatio) / 2
253255

254256
const tenkb = 10 * 1024
255-
neitherWriteNorCPUHeavy := approxWriteBytes < tenkb && approxVCPUs < .5
257+
neitherWriteNorCPUHeavy := beforeReplicationWriteBytes < tenkb && approxVCPUs < .5
256258

257259
// We tolerate abnormally low CPU if there's a sensible amount of
258260
// write load. Otherwise, it's likely a mistake.
@@ -261,9 +263,24 @@ func TestDataDriven(t *testing.T) {
261263
}
262264
// Similarly, tolerate abnormally low write load when there's
263265
// significant CPU. Independently, call out high write load.
264-
if (neitherWriteNorCPUHeavy && approxWriteBytes > 0) || approxWriteBytes > 40*(1<<20) {
266+
if (neitherWriteNorCPUHeavy && beforeReplicationWriteBytes > 0) || beforeReplicationWriteBytes > 40*(1<<20) {
265267
_, _ = fmt.Fprintf(&buf, "WARNING: write load of %s is likely accidental\n",
266-
humanizeutil.IBytes(int64(approxWriteBytes)))
268+
humanizeutil.IBytes(int64(beforeReplicationWriteBytes)))
269+
}
270+
{
271+
var parts []string
272+
if accessVCPUs > 0 {
273+
parts = append(parts, fmt.Sprintf("%.2f access-vcpus", accessVCPUs))
274+
}
275+
if beforeReplicationRaftVCPUs > 0 {
276+
parts = append(parts, fmt.Sprintf("%.2f raft-vcpus", beforeReplicationRaftVCPUs))
277+
}
278+
if beforeReplicationWriteBytes > 0 {
279+
parts = append(parts, fmt.Sprintf("%s/s goodput", humanizeutil.IBytes(int64(beforeReplicationWriteBytes))))
280+
}
281+
if len(parts) > 0 {
282+
_, _ = fmt.Fprintln(&buf, strings.Join(parts, ", "))
283+
}
267284
}
268285

269286
var nextLoadGen gen.BasicLoad
@@ -560,8 +577,12 @@ func TestDataDriven(t *testing.T) {
560577
seedGen := rand.New(rand.NewSource(seed))
561578
for sample := 0; sample < samples; sample++ {
562579
tr := makeTraceHelper(rewrite, plotDir, testName, sample+1, duration)
563-
settingsGen.Settings.OnRecording = func(storeID int64, atDuration time.Duration, rec tracingpb.Recording) {
564-
tr.OnRecording(t, storeID, atDuration, rec)
580+
if tr.enabled {
581+
// Only populate OnRecording if we're going to save the results.
582+
// That way, we avoid creating trace spans during normal test runs.
583+
settingsGen.Settings.OnRecording = func(storeID int64, atDuration time.Duration, rec tracingpb.Recording) {
584+
tr.OnRecording(t, storeID, atDuration, rec)
585+
}
565586
}
566587

567588
assertionFailures := []string{}

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/heterogeneous_cpu.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ gen_ranges ranges=200 min_key=1 max_key=10000 placement_type=even
2121
# cpu nanos per second.
2222
gen_load rate=40000 rw_ratio=1 request_cpu_per_access=500000 min_key=1 max_key=10000
2323
----
24+
20.00 access-vcpus
2425

2526
# We want the CPU load to balance based on %cpu. But both the MMA and SMA balance
2627
# on absolute cpu-nanos, i.e. not taking into account that n3 has double the capacity.

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,20 @@ setting split_queue_enabled=false
2222
gen_ranges ranges=100 min_key=0 max_key=10000
2323
----
2424

25+
# TODO(tbg): likely accidentally too low.
2526
gen_load rate=5000 rw_ratio=0.95 min_block=100 max_block=100 request_cpu_per_access=100 raft_cpu_per_write=20 min_key=0 max_key=10000
2627
----
28+
0.00 access-vcpus, 0.00 raft-vcpus, 24 KiB/s goodput
2729

2830
# Another workload is added over the second half of the keyspace, which is initially
2931
# mostly on s1-s3.
3032
gen_ranges ranges=50 min_key=10001 max_key=20000 placement_type=skewed
3133
----
3234

35+
# TODO(tbg): likely accidentally too low.
3336
gen_load rate=5000 rw_ratio=0.95 min_block=128 max_block=128 request_cpu_per_access=100000 raft_cpu_per_write=20000 min_key=10001 max_key=20000
3437
----
38+
0.50 access-vcpus, 0.01 raft-vcpus, 31 KiB/s goodput
3539

3640
eval duration=2m samples=1 seed=42 cfgs=(mma-only,mma-count) metrics=(cpu,cpu_util,replicas,leases)
3741
----

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_25nodes.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@ setting split_queue_enabled=false
1212
gen_ranges ranges=50 min_key=0 max_key=10000
1313
----
1414

15+
# TODO(tbg): likely accidentally too low.
1516
gen_load rate=15000 rw_ratio=0.95 min_block=100 max_block=100 request_cpu_per_access=100 raft_cpu_per_write=20 min_key=0 max_key=10000
1617
----
18+
0.00 access-vcpus, 0.00 raft-vcpus, 73 KiB/s goodput
1719

1820
# Another workload is added over the second half of the keyspace, which is initially
1921
# mainly on s1-s3 due to the skewed distribution.
@@ -22,6 +24,7 @@ gen_ranges ranges=50 min_key=10001 max_key=20000 placement_type=skewed
2224

2325
gen_load rate=15000 rw_ratio=0.95 min_block=1 max_block=1 request_cpu_per_access=7000000 raft_cpu_per_write=20000 min_key=10001 max_key=20000
2426
----
27+
105.00 access-vcpus, 0.02 raft-vcpus, 750 B/s goodput
2528

2629
eval duration=20m samples=1 seed=42 cfgs=(mma-only,mma-count) metrics=(cpu,cpu_util,write_bytes_per_second,replicas,leases)
2730
----

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_able_to_shed_leases.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ gen_ranges ranges=25 min_key=0 max_key=10000 placement_type=replica_placement
2323

2424
gen_load rate=50000 rw_ratio=0 min_key=0 max_key=10000 raft_cpu_per_write=100000
2525
----
26+
5.00 raft-vcpus, 49 KiB/s goodput
2627

2728
eval duration=5m samples=1 seed=42 cfgs=(mma-only,mma-count) metrics=(cpu,cpu_util,write_bytes_per_second,replicas,leases)
2829
----

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_unable_to_shed_leases.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ gen_ranges ranges=25 min_key=0 max_key=10000 placement_type=replica_placement
3535

3636
gen_load rate=5000 rw_ratio=0 min_key=0 max_key=10000 raft_cpu_per_write=1000000
3737
----
38+
5.00 raft-vcpus, 4.9 KiB/s goodput
3839

3940
eval duration=5m samples=1 seed=42 cfgs=(mma-only,mma-count) metrics=(cpu,cpu_util,write_bytes_per_second,replicas,leases)
4041
----

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_write_uniform_cpu.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ gen_ranges ranges=30 min_key=1 max_key=10000 placement_type=even
1313

1414
gen_load rate=1000 rw_ratio=1.0 request_cpu_per_access=5000000 min_key=1 max_key=10000
1515
----
16+
5.00 access-vcpus
1617

1718
# Write only workload, which generates no CPU and 20000op/s*1000B/op =
1819
# 20000000B/s (x 3 replication factor) write bytes per second over the second half
@@ -22,6 +23,7 @@ gen_ranges ranges=30 min_key=10001 max_key=20000 placement_type=skewed
2223

2324
gen_load rate=20000 rw_ratio=0 min_block=1000 max_block=1000 min_key=10001 max_key=20000
2425
----
26+
19 MiB/s goodput
2527

2628
setting split_queue_enabled=false
2729
----

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/one_voter_skewed_cpu_skewed_write.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,17 @@ gen_ranges ranges=100 repl_factor=1 min_key=10001 max_key=20000 placement_type=r
1919
{s2:*}:1
2020

2121
# read cpu load of 1000x100=10k, all hitting s1, which is then at 100% cpu.
22+
# TODO(tbg): the CPU count is accidentally too low.
2223
gen_load rate=1000 rw_ratio=1.0 request_cpu_per_access=500000 min_key=1 max_key=10000
2324
----
25+
0.50 access-vcpus
2426

2527
# Write-only workload, which generates 20% cpu and 5mb of writes per second
2628
# over the second half of the keyspace.
29+
# TODO(tbg): the CPU load is too low.
2730
gen_load rate=5000 rw_ratio=0 min_block=1000 max_block=1000 raft_cpu_per_write=1 min_key=10001 max_key=20000
2831
----
32+
0.00 raft-vcpus, 4.8 MiB/s goodput
2933

3034
setting split_queue_enabled=false
3135
----

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_even_ranges_mma.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ gen_ranges ranges=36 min_key=1 max_key=10000 placement_type=replica_placement by
2020
# 5ms of request CPU per access and 500µs of raft CPU per write @ 1000/s.
2121
gen_load rate=1000 rw_ratio=0.95 min_block=100 max_block=100 request_cpu_per_access=5000000 raft_cpu_per_write=500000 min_key=1 max_key=10000
2222
----
23+
5.00 access-vcpus, 0.03 raft-vcpus, 4.9 KiB/s goodput
2324

2425
# Almost empty workload, which generates no CPU and a small amount of writes
2526
# over the second half of the keyspace, scattered over s4-s9.
@@ -30,8 +31,10 @@ gen_ranges ranges=72 min_key=10001 max_key=20000 placement_type=replica_placemen
3031
{s4:*,s5,s6}:1
3132
{s7:*,s8,s9}:1
3233

34+
# TODO(tbg): this is barely anything, is this intentional?
3335
gen_load rate=100 rw_ratio=0 min_block=128 max_block=128 min_key=10001 max_key=20000
3436
----
37+
12 KiB/s goodput
3538

3639
setting split_queue_enabled=false
3740
----

0 commit comments

Comments
 (0)