1
+ # This test verifies that the allocator can rebalance replicas and leases when
2
+ # there is high cpu load imbalance across the cluster. The test sets up a 10-node
3
+ # cluster with two distinct workloads: one evenly distributed across all nodes,
4
+ # and another high-cpu workload initially concentrated on only the first few nodes
5
+ # due to skewed placement. The second workload has significantly higher cpu cost
6
+ # per op, creating cpu imbalance.
7
+ #
8
+ # Expected outcome: The allocator should rebalance both replicas and leases to
9
+ # distribute the high-cpu workload more evenly across all 10 nodes.
1
10
gen_cluster nodes=10 node_cpu_rate_capacity=8000000000
2
11
----
3
12
13
+ # TODO(wenyihu6): why didn't we balance more replicas/leases? Is it because of a very high cpu per range?
14
+
4
15
# Set the rebalance mode to use the mma store rebalancer and disable the lease
5
16
# and replicate queues so that only the mma store rebalancer is moving replicas
6
17
# or leases.
@@ -15,36 +26,36 @@ gen_load rate=5000 rw_ratio=0.95 min_block=100 max_block=100 request_cpu_per_acc
15
26
----
16
27
17
28
# Another workload is added over the second half of the keyspace, which is initially
18
- # only on s1-s3.
29
+ # mostly on s1-s3.
19
30
gen_ranges ranges=50 min_key=10001 max_key=20000 placement_type=skewed
20
31
----
21
32
22
33
gen_load rate=5000 rw_ratio=0.95 min_block=128 max_block=128 request_cpu_per_access=100000 raft_cpu_per_write=20000 min_key=10001 max_key=20000
23
34
----
24
35
25
- eval duration=15m samples=1 seed=42 cfgs=(mma-only,mma-count) metrics=(cpu,write_bytes_per_second ,replicas,leases)
36
+ eval duration=2m samples=1 seed=42 cfgs=(mma-only,mma-count) metrics=(cpu,cpu_util ,replicas,leases)
26
37
----
27
- cpu#1: last: [s1=274870057, s2=124118783, s3=42166496, s4=21298975, s5=10805903, s6=10577758, s7=453407, s8=10306222, s9=10413474, s10=10503921] (stddev=81956672.84, mean=51551499.60, sum=515514996)
28
- cpu#1: thrash_pct: [s1=7%, s2=6%, s3=4%, s4=3%, s5=2%, s6=2%, s7=0%, s8=2%, s9=2%, s10=2%] (sum=30%)
38
+ cpu#1: last: [s1=275096159, s2=123983362, s3=41814276, s4=21433672, s5=10796253, s6=10602552, s7=439843, s8=10300378, s9=10452776, s10=10595723] (stddev=81999286.66, mean=51551499.40, sum=515514994)
39
+ cpu#1: thrash_pct: [s1=4%, s2=3%, s3=3%, s4=2%, s5=1%, s6=1%, s7=0%, s8=1%, s9=1%, s10=1%] (sum=18%)
40
+ cpu_util#1: last: [s1=0.03, s2=0.02, s3=0.01, s4=0.00, s5=0.00, s6=0.00, s7=0.00, s8=0.00, s9=0.00, s10=0.00] (stddev=0.01, mean=0.01, sum=0)
41
+ cpu_util#1: thrash_pct: [s1=4%, s2=3%, s3=3%, s4=2%, s5=1%, s6=1%, s7=0%, s8=1%, s9=1%, s10=1%] (sum=18%)
29
42
leases#1: first: [s1=37, s2=22, s3=14, s4=13, s5=11, s6=11, s7=10, s8=11, s9=10, s10=11] (stddev=8.07, mean=15.00, sum=150)
30
43
leases#1: last: [s1=37, s2=22, s3=14, s4=13, s5=11, s6=11, s7=10, s8=11, s9=10, s10=11] (stddev=8.07, mean=15.00, sum=150)
31
44
leases#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%, s10=0%] (sum=0%)
32
45
replicas#1: first: [s1=80, s2=70, s3=51, s4=42, s5=37, s6=35, s7=34, s8=33, s9=34, s10=34] (stddev=16.02, mean=45.00, sum=450)
33
46
replicas#1: last: [s1=80, s2=70, s3=51, s4=42, s5=37, s6=35, s7=34, s8=33, s9=34, s10=34] (stddev=16.02, mean=45.00, sum=450)
34
47
replicas#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%, s10=0%] (sum=0%)
35
- write_bytes_per_second#1: last: [s1=39511, s2=33080, s3=20899, s4=15208, s5=11942, s6=10699, s7=10093, s8=9465, s9=10055, s10=10043] (stddev=10247.09, mean=17099.50, sum=170995)
36
- write_bytes_per_second#1: thrash_pct: [s1=13%, s2=18%, s3=20%, s4=19%, s5=17%, s6=16%, s7=14%, s8=14%, s9=17%, s10=16%] (sum=165%)
37
- artifacts[mma-only]: bd71a8872f557e0f
48
+ artifacts[mma-only]: c9c14a2b21947e75
38
49
==========================
39
- cpu#1: last: [s1=153545974, s2=82571497, s3=61967377, s4=31436939, s5=21209665, s6=31257441, s7=10903219, s8=40903888, s9=51026201, s10=30714935] (stddev=39256550.41, mean=51553713.60, sum=515537136)
40
- cpu#1: thrash_pct: [s1=10%, s2=6%, s3=7%, s4=5%, s5=4%, s6=6%, s7=3%, s8=6%, s9=7%, s10=5%] (sum=58%)
50
+ cpu#1: last: [s1=153767559, s2=82526536, s3=61655396, s4=31442666, s5=21243662, s6=31483931, s7=10725049, s8=40802943, s9=51247053, s10=30866698] (stddev=39300865.24, mean=51576149.30, sum=515761493)
51
+ cpu#1: thrash_pct: [s1=6%, s2=4%, s3=4%, s4=3%, s5=2%, s6=4%, s7=1%, s8=4%, s9=5%, s10=3%] (sum=37%)
52
+ cpu_util#1: last: [s1=0.02, s2=0.01, s3=0.01, s4=0.00, s5=0.00, s6=0.00, s7=0.00, s8=0.01, s9=0.01, s10=0.00] (stddev=0.00, mean=0.01, sum=0)
53
+ cpu_util#1: thrash_pct: [s1=6%, s2=4%, s3=4%, s4=3%, s5=2%, s6=4%, s7=1%, s8=4%, s9=5%, s10=3%] (sum=37%)
41
54
leases#1: first: [s1=37, s2=22, s3=14, s4=13, s5=11, s6=11, s7=10, s8=11, s9=10, s10=11] (stddev=8.07, mean=15.00, sum=150)
42
55
leases#1: last: [s1=20, s2=16, s3=15, s4=16, s5=12, s6=14, s7=12, s8=15, s9=15, s10=15] (stddev=2.14, mean=15.00, sum=150)
43
56
leases#1: thrash_pct: [s1=0%, s2=0%, s3=15%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%, s10=0%] (sum=15%)
44
57
replicas#1: first: [s1=80, s2=70, s3=51, s4=42, s5=37, s6=35, s7=34, s8=33, s9=34, s10=34] (stddev=16.02, mean=45.00, sum=450)
45
58
replicas#1: last: [s1=45, s2=44, s3=44, s4=47, s5=44, s6=44, s7=44, s8=45, s9=46, s10=47] (stddev=1.18, mean=45.00, sum=450)
46
59
replicas#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%, s8=0%, s9=0%, s10=0%] (sum=0%)
47
- write_bytes_per_second#1: last: [s1=25330, s2=20719, s3=18391, s4=17246, s5=15288, s6=15257, s7=14520, s8=14450, s9=15382, s10=14423] (stddev=3361.47, mean=17100.60, sum=171006)
48
- write_bytes_per_second#1: thrash_pct: [s1=84%, s2=62%, s3=67%, s4=35%, s5=45%, s6=38%, s7=29%, s8=29%, s9=42%, s10=40%] (sum=471%)
49
- artifacts[mma-count]: abbd0fc9dbc1971a
60
+ artifacts[mma-count]: de0b265129d19e1
50
61
==========================
0 commit comments