Skip to content

Commit 0618bf0

Browse files
craig[bot] and wenyihu6 committed
Merge #153991
153991: asim: scale up cpu load in gen_load r=tbg a=wenyihu6 Epic: CRDB-49117 Release note: none --- **asim: scale up cpu load in gen_load** This commit adjusts some cpu load config for gen_load (e.g., request_cpu_per_access) to maintain a reasonable cpu utilization in some of the asim dd setups. Some values had been too low from the beginning, and recent changes to scale up node_cpu_rate_capacity made things worse. This is important because mma only triggers cpu rebalancing when cpu util exceeds a certain threshold. https://github.com/cockroachdb/cockroach/blob/93f41d0e0dabdf54c7275581e0767452d6df5390/pkg/kv/kvserver/allocator/mmaprototype/load.go#L531 --- **asim: rename mma-and-count to mma-count** This commit renames mma-and-count to mma-count for consistency with sma-count. --- **asim: remove topology.txt** Previously, we stopped generating topology in asim tests but didn’t clean up the corresponding folder. This commit removes the leftover topology files from the generated folder. An open question remains on whether we should always clear the generated folder before writing new content. --- **asim: add mma-count mode to some tests** This commit adds mma-count mode to the relevant mma asim dd test setups, preparing for future work on a thrashing-prevention mechanism and its impact assessment. Co-authored-by: wenyihu6 <[email protected]>
2 parents 44124a9 + 77ba504 commit 0618bf0

22 files changed

+263
-139
lines changed

pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,7 @@ func TestDataDriven(t *testing.T) {
517517
})
518518
},
519519
// Both the replicate/lease queues and the MMA are enabled.
520-
"mma-and-count": func(eg *gen.StaticEvents) {
520+
"mma-count": func(eg *gen.StaticEvents) {
521521
eg.ScheduleEvent(settingsGen.Settings.StartTime, 0,
522522
event.SetSimulationSettingsEvent{
523523
IsClusterSetting: true,

pkg/kv/kvserver/asim/tests/testdata/generated/example_rebalancing/example_rebalancing_mma-only_1_topology.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

pkg/kv/kvserver/asim/tests/testdata/generated/example_rebalancing/example_rebalancing_mma-only_2_topology.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

pkg/kv/kvserver/asim/tests/testdata/generated/example_rebalancing/example_rebalancing_sma-count_1_topology.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

pkg/kv/kvserver/asim/tests/testdata/generated/example_rebalancing/example_rebalancing_sma-count_2_topology.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

pkg/kv/kvserver/asim/tests/testdata/non_rand/example_fulldisk.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ set_capacity store=5 capacity=107374182400
2020
# We will repeatedly hit the disk fullness threshold which causes shedding
2121
# replicas on store 5. We should see s5 hovering right around 92.5-95%
2222
# (the storage capacity threshold value).
23-
eval duration=30m seed=42 metrics=(replicas,disk_fraction_used) cfgs=(sma-count,mma-only,mma-and-count)
23+
eval duration=30m seed=42 metrics=(replicas,disk_fraction_used) cfgs=(sma-count,mma-only,mma-count)
2424
----
2525
disk_fraction_used#1: first: [s1=0.20, s2=0.20, s3=0.20, s4=0.20, s5=1.05] (stddev=0.34, mean=0.37, sum=2)
2626
disk_fraction_used#1: last: [s1=0.30, s2=0.30, s3=0.30, s4=0.30, s5=0.95] (stddev=0.26, mean=0.43, sum=2)
@@ -44,5 +44,5 @@ disk_fraction_used#1: thrash_pct: [s1=37%, s2=35%, s3=37%, s4=33%, s5=174%] (su
4444
replicas#1: first: [s1=300, s2=300, s3=300, s4=300, s5=300] (stddev=0.00, mean=300.00, sum=1500)
4545
replicas#1: last: [s1=327, s2=326, s3=328, s4=323, s5=196] (stddev=52.03, mean=300.00, sum=1500)
4646
replicas#1: thrash_pct: [s1=280%, s2=266%, s3=277%, s4=248%, s5=181%] (sum=1252%)
47-
artifacts[mma-and-count]: 2a17b81d800bb09
47+
artifacts[mma-count]: 2a17b81d800bb09
4848
==========================

pkg/kv/kvserver/asim/tests/testdata/non_rand/example_skewed_cpu_even_ranges_mma.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ setting split_queue_enabled=false
3434
# TODO(tbg): it's interesting that sma-only does better on write throughput than
3535
# mma-only. Looking at the graphs, the mma-only flavor is much slower in moving
3636
# load around. Possibly a bug?
37-
eval duration=35m samples=1 seed=42 cfgs=(sma-count,mma-only,mma-and-count) metrics=(cpu,cpu_util,leases,replicas,write_bytes_per_second)
37+
eval duration=35m samples=1 seed=42 cfgs=(sma-count,mma-only,mma-count) metrics=(cpu,cpu_util,leases,replicas,write_bytes_per_second)
3838
----
3939
cpu#1: last: [s1=984642922, s2=1126669982, s3=703798726, s4=419425213, s5=432156047, s6=559114683, s7=142318086, s8=283346689, s9=424209084] (stddev=303225920.55, mean=563964603.56, sum=5075681432)
4040
cpu#1: thrash_pct: [s1=14%, s2=49%, s3=47%, s4=9%, s5=9%, s6=11%, s7=4%, s8=14%, s9=25%] (sum=182%)
@@ -76,5 +76,5 @@ replicas#1: last: [s1=36, s2=34, s3=36, s4=37, s5=36, s6=36, s7=37, s8=36, s9=3
7676
replicas#1: thrash_pct: [s1=150%, s2=455%, s3=325%, s4=127%, s5=125%, s6=150%, s7=352%, s8=150%, s9=150%] (sum=1985%)
7777
write_bytes_per_second#1: last: [s1=5194, s2=5256, s3=5415, s4=6318, s5=6216, s6=6171, s7=6283, s8=6298, s9=6257] (stddev=461.69, mean=5934.22, sum=53408)
7878
write_bytes_per_second#1: thrash_pct: [s1=1721%, s2=1566%, s3=1458%, s4=714%, s5=542%, s6=570%, s7=1227%, s8=694%, s9=803%] (sum=9293%)
79-
artifacts[mma-and-count]: 9acfd49d51ee3d3c
79+
artifacts[mma-count]: 9acfd49d51ee3d3c
8080
==========================

pkg/kv/kvserver/asim/tests/testdata/non_rand/heterogeneous_cpu

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ assertion stat=cpu_util type=balance ticks=6 upper_bound=1.1
2121
----
2222
asserting: max_{stores}(cpu_util)/mean_{stores}(cpu_util) ≤ 1.10 at each of last 6 ticks
2323

24-
eval cfgs=(sma-count,mma-only) duration=10m metrics=(cpu,cpu_util)
24+
eval cfgs=(sma-count,mma-only,mma-count) duration=10m metrics=(cpu,cpu_util)
2525
----
2626
cpu#1: last: [s1=6700914166, s2=6696073333, s3=6603012499] (stddev=45053658.00, mean=6666666666.00, sum=19999999998)
2727
cpu#1: thrash_pct: [s1=185%, s2=170%, s3=189%] (sum=544%)
@@ -51,3 +51,17 @@ failed assertion sample 1
5151
max/mean=1.17 tick=4
5252
max/mean=1.17 tick=5
5353
==========================
54+
cpu#1: last: [s1=6397924999, s2=6398685833, s3=7202298517] (stddev=379006109.77, mean=6666303116.33, sum=19998909349)
55+
cpu#1: thrash_pct: [s1=33%, s2=37%, s3=40%] (sum=110%)
56+
cpu_util#1: last: [s1=0.80, s2=0.80, s3=0.45] (stddev=0.16, mean=0.68, sum=2)
57+
cpu_util#1: thrash_pct: [s1=8%, s2=9%, s3=5%] (sum=21%)
58+
artifacts[mma-count]: e783594cc55d9a0
59+
failed assertion sample 1
60+
balance stat=cpu_util threshold=(≤1.10) ticks=6
61+
max/mean=1.17 tick=0
62+
max/mean=1.17 tick=1
63+
max/mean=1.17 tick=2
64+
max/mean=1.17 tick=3
65+
max/mean=1.17 tick=4
66+
max/mean=1.17 tick=5
67+
==========================

pkg/kv/kvserver/asim/tests/testdata/non_rand/load_distribution_movement_disabled_enable_later.txt

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ gen_ranges ranges=30 min_key=10001 max_key=20000 placement_type=replica_placemen
3434
{s10:*,s6,s7}:1
3535

3636
# Read-only workload on first 30 ranges. 5 cores.
37-
gen_load rate=1000000 rw_ratio=1 request_cpu_per_access=5000 min_key=1 max_key=10000
37+
gen_load rate=1000000 rw_ratio=1 request_cpu_per_access=50000 min_key=1 max_key=10000
3838
----
3939

4040
# Write only workload on second 30 ranges. 10mb/s before 3x replication.
@@ -47,17 +47,17 @@ setting rebalance_mode=4 delay=2m
4747

4848
eval duration=7m samples=1 seed=42 cfgs=(sma-count) metrics=(cpu,cpu_util,write_bytes_per_second,replicas,leases)
4949
----
50-
cpu#1: last: [s1=451007155, s2=424066775, s3=481923720, s4=487552810, s5=432089126, s6=536127090, s7=582915883, s8=497226435, s9=545078443, s10=559276708] (stddev=51954473.68, mean=499726414.50, sum=4997264145)
51-
cpu#1: thrash_pct: [s1=19%, s2=33%, s3=40%, s4=24%, s5=15%, s6=16%, s7=14%, s8=7%, s9=12%, s10=14%] (sum=193%)
52-
cpu_util#1: last: [s1=0.03, s2=0.03, s3=0.03, s4=0.03, s5=0.03, s6=0.03, s7=0.04, s8=0.03, s9=0.03, s10=0.03] (stddev=0.00, mean=0.03, sum=0)
53-
cpu_util#1: thrash_pct: [s1=19%, s2=33%, s3=40%, s4=24%, s5=15%, s6=16%, s7=14%, s8=7%, s9=12%, s10=14%] (sum=193%)
50+
cpu#1: last: [s1=5081867763, s2=4625570068, s3=4683450985, s4=5175660716, s5=4480723281, s6=5400432969, s7=5232941604, s8=5060159898, s9=5285669166, s10=4934994414] (stddev=291808172.88, mean=4996147086.40, sum=49961470864)
51+
cpu#1: thrash_pct: [s1=29%, s2=33%, s3=54%, s4=33%, s5=22%, s6=6%, s7=35%, s8=14%, s9=18%, s10=32%] (sum=275%)
52+
cpu_util#1: last: [s1=0.32, s2=0.29, s3=0.29, s4=0.32, s5=0.28, s6=0.34, s7=0.33, s8=0.32, s9=0.33, s10=0.31] (stddev=0.02, mean=0.31, sum=3)
53+
cpu_util#1: thrash_pct: [s1=29%, s2=33%, s3=54%, s4=33%, s5=22%, s6=6%, s7=35%, s8=14%, s9=18%, s10=32%] (sum=275%)
5454
leases#1: first: [s1=6, s2=6, s3=6, s4=6, s5=6, s6=6, s7=6, s8=6, s9=6, s10=6] (stddev=0.00, mean=6.00, sum=60)
55-
leases#1: last: [s1=54, s2=56, s3=55, s4=53, s5=57, s6=58, s7=60, s8=57, s9=60, s10=61] (stddev=2.55, mean=57.10, sum=571)
56-
leases#1: thrash_pct: [s1=106%, s2=118%, s3=95%, s4=95%, s5=123%, s6=33%, s7=33%, s8=27%, s9=27%, s10=30%] (sum=688%)
55+
leases#1: last: [s1=56, s2=56, s3=56, s4=56, s5=55, s6=59, s7=58, s8=62, s9=58, s10=55] (stddev=2.07, mean=57.10, sum=571)
56+
leases#1: thrash_pct: [s1=115%, s2=115%, s3=120%, s4=101%, s5=140%, s6=24%, s7=53%, s8=30%, s9=33%, s10=56%] (sum=787%)
5757
replicas#1: first: [s1=18, s2=18, s3=18, s4=18, s5=18, s6=18, s7=18, s8=18, s9=18, s10=18] (stddev=0.00, mean=18.00, sum=180)
58-
replicas#1: last: [s1=175, s2=172, s3=179, s4=173, s5=173, s6=174, s7=167, s8=169, s9=169, s10=162] (stddev=4.50, mean=171.30, sum=1713)
59-
replicas#1: thrash_pct: [s1=87%, s2=91%, s3=92%, s4=93%, s5=98%, s6=16%, s7=15%, s8=14%, s9=15%, s10=10%] (sum=531%)
60-
write_bytes_per_second#1: last: [s1=2994621, s2=2991747, s3=3008247, s4=2988902, s5=3006699, s6=2993336, s7=2993993, s8=3002728, s9=2996629, s10=2997216] (stddev=6108.70, mean=2997411.80, sum=29974118)
61-
write_bytes_per_second#1: thrash_pct: [s1=73%, s2=40%, s3=85%, s4=74%, s5=5%, s6=160%, s7=207%, s8=103%, s9=114%, s10=55%] (sum=916%)
62-
artifacts[sma-count]: ae2919ade8043626
58+
replicas#1: last: [s1=170, s2=176, s3=174, s4=175, s5=172, s6=166, s7=172, s8=170, s9=170, s10=168] (stddev=2.97, mean=171.30, sum=1713)
59+
replicas#1: thrash_pct: [s1=91%, s2=91%, s3=105%, s4=93%, s5=102%, s6=16%, s7=17%, s8=18%, s9=15%, s10=16%] (sum=564%)
60+
write_bytes_per_second#1: last: [s1=2661323, s2=2656875, s3=4334085, s4=3343904, s5=2658933, s6=2329852, s7=2329103, s8=3684224, s9=3345882, s10=2662193] (stddev=620316.35, mean=3000637.40, sum=30006374)
61+
write_bytes_per_second#1: thrash_pct: [s1=128%, s2=93%, s3=155%, s4=167%, s5=79%, s6=173%, s7=220%, s8=204%, s9=169%, s10=169%] (sum=1557%)
62+
artifacts[sma-count]: b796f1485279b21d
6363
==========================

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma_constraint_satisfaction1.txt

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ gen_ranges ranges=100 repl_factor=5 placement_type=replica_placement
1818
----
1919
{s1:*,s2,s4,s5,s6}:1
2020

21-
gen_load rate=1000 rw_ratio=0.95 min_block=1000 max_block=1000 request_cpu_per_access=100000 raft_cpu_per_write=10000
21+
gen_load rate=1000 rw_ratio=0.95 min_block=1000 max_block=1000 request_cpu_per_access=5000000 raft_cpu_per_write=10000
2222
----
2323

2424
set_span_config
@@ -28,17 +28,29 @@ set_span_config
2828
setting split_queue_enabled=false
2929
----
3030

31-
eval duration=40m samples=1 seed=42 cfgs=(mma-only) metrics=(cpu,cpu_util,leases,replicas)
31+
eval duration=40m samples=1 seed=42 cfgs=(mma-only,mma-count) metrics=(cpu,cpu_util,leases,replicas)
3232
----
33-
cpu#1: last: [s1=100533499, s2=500166, s3=0, s4=500166, s5=500166, s6=500166, s7=0, s8=0, s9=0] (stddev=31516919.07, mean=11392684.78, sum=102534163)
34-
cpu#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=0%)
35-
cpu_util#1: last: [s1=0.02, s2=0.00, s3=0.00, s4=0.00, s5=0.00, s6=0.00, s7=0.00, s8=0.00, s9=0.00] (stddev=0.01, mean=0.00, sum=0)
36-
cpu_util#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=0%)
33+
cpu#1: last: [s1=1796346149, s2=1601160605, s3=1602033353, s4=499258, s5=499258, s6=499258, s7=0, s8=0, s9=0] (stddev=787270621.50, mean=555670875.67, sum=5001037881)
34+
cpu#1: thrash_pct: [s1=8%, s2=31%, s3=31%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=70%)
35+
cpu_util#1: last: [s1=0.36, s2=0.32, s3=0.32, s4=0.00, s5=0.00, s6=0.00, s7=0.00, s8=0.00, s9=0.00] (stddev=0.16, mean=0.11, sum=1)
36+
cpu_util#1: thrash_pct: [s1=8%, s2=31%, s3=31%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=70%)
3737
leases#1: first: [s1=100, s2=0, s3=0, s4=0, s5=0, s6=0, s7=0, s8=0, s9=0] (stddev=31.43, mean=11.11, sum=100)
38-
leases#1: last: [s1=100, s2=0, s3=0, s4=0, s5=0, s6=0, s7=0, s8=0, s9=0] (stddev=31.43, mean=11.11, sum=100)
38+
leases#1: last: [s1=36, s2=32, s3=32, s4=0, s5=0, s6=0, s7=0, s8=0, s9=0] (stddev=15.75, mean=11.11, sum=100)
3939
leases#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=0%)
4040
replicas#1: first: [s1=100, s2=100, s3=0, s4=100, s5=100, s6=100, s7=0, s8=0, s9=0] (stddev=49.69, mean=55.56, sum=500)
41-
replicas#1: last: [s1=100, s2=100, s3=0, s4=100, s5=100, s6=100, s7=0, s8=0, s9=0] (stddev=49.69, mean=55.56, sum=500)
41+
replicas#1: last: [s1=68, s2=100, s3=32, s4=100, s5=100, s6=100, s7=0, s8=0, s9=0] (stddev=44.56, mean=55.56, sum=500)
4242
replicas#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=0%)
43-
artifacts[mma-only]: 621f3d4f69649ede
43+
artifacts[mma-only]: 29c929f977fd070d
44+
==========================
45+
cpu#1: last: [s1=1802985700, s2=1599804035, s3=1599681058, s4=499781, s5=499781, s6=499781, s7=0, s8=0, s9=0] (stddev=787888869.08, mean=555996681.78, sum=5003970136)
46+
cpu#1: thrash_pct: [s1=13%, s2=21%, s3=36%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=70%)
47+
cpu_util#1: last: [s1=0.36, s2=0.32, s3=0.32, s4=0.00, s5=0.00, s6=0.00, s7=0.00, s8=0.00, s9=0.00] (stddev=0.16, mean=0.11, sum=1)
48+
cpu_util#1: thrash_pct: [s1=13%, s2=21%, s3=36%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=70%)
49+
leases#1: first: [s1=100, s2=0, s3=0, s4=0, s5=0, s6=0, s7=0, s8=0, s9=0] (stddev=31.43, mean=11.11, sum=100)
50+
leases#1: last: [s1=36, s2=32, s3=32, s4=0, s5=0, s6=0, s7=0, s8=0, s9=0] (stddev=15.75, mean=11.11, sum=100)
51+
leases#1: thrash_pct: [s1=0%, s2=0%, s3=17%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=17%)
52+
replicas#1: first: [s1=100, s2=100, s3=0, s4=100, s5=100, s6=100, s7=0, s8=0, s9=0] (stddev=49.69, mean=55.56, sum=500)
53+
replicas#1: last: [s1=67, s2=69, s3=64, s4=100, s5=100, s6=100, s7=0, s8=0, s9=0] (stddev=41.59, mean=55.56, sum=500)
54+
replicas#1: thrash_pct: [s1=3%, s2=6%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%] (sum=9%)
55+
artifacts[mma-count]: 1c29997c0b722745
4456
==========================

0 commit comments

Comments
 (0)