From 8d229689ba25caddd3bca09e6325293c1a92d3e6 Mon Sep 17 00:00:00 2001 From: Angela Dietz Date: Thu, 20 Nov 2025 12:38:34 -0500 Subject: [PATCH] asim: specify node capacity in cores, not nanos/sec Previously, the datadriven asim tests expressed node capacity in nanoseconds/second which are difficult to read due to lots of zeros. Now, the node capacity is expressed in cores (float) which is much more readable and less prone to user error. For example, `node_cpu_rate_capacity=8000000000` is now expressed as `node_cpu_cores=8`. Fixes #156845 Epic: CRDB-55052 Release note: none. --- pkg/kv/kvserver/asim/config/settings.go | 11 ++-- pkg/kv/kvserver/asim/state/BUILD.bazel | 2 + pkg/kv/kvserver/asim/state/node_cpu_cores.go | 19 +++++++ .../asim/state/node_cpu_cores_test.go | 52 +++++++++++++++++++ .../asim/tests/datadriven_simulation_test.go | 20 +++---- .../non_rand/mma/heterogeneous_cpu.txt | 2 +- .../tests/testdata/non_rand/mma/high_cpu.txt | 2 +- .../non_rand/mma/high_cpu_25nodes.txt | 2 +- .../mma/high_cpu_able_to_shed_leases.txt | 2 +- .../mma/high_cpu_unable_to_shed_leases.txt | 2 +- .../non_rand/mma/high_write_uniform_cpu.txt | 2 +- .../mma/one_voter_skewed_cpu_skewed_write.txt | 2 +- .../mma/skewed_cpu_even_ranges_mma.txt | 2 +- .../non_rand/mma/skewed_cpu_skewed_write.txt | 2 +- .../skewed_cpu_skewed_write_more_ranges.txt | 2 +- 15 files changed, 100 insertions(+), 24 deletions(-) create mode 100644 pkg/kv/kvserver/asim/state/node_cpu_cores.go create mode 100644 pkg/kv/kvserver/asim/state/node_cpu_cores_test.go diff --git a/pkg/kv/kvserver/asim/config/settings.go b/pkg/kv/kvserver/asim/config/settings.go index bd0d51f6ba61..18a6080266c0 100644 --- a/pkg/kv/kvserver/asim/config/settings.go +++ b/pkg/kv/kvserver/asim/config/settings.go @@ -31,10 +31,13 @@ const ( defaultLBRebalancingInterval = time.Minute ) -const DefaultNodeCPURateCapacityNanos = 8 * 1e9 // 8 vcpus -const DefaultStoreDiskCapacityBytes = 1024 << 30 // 1024 GiB -const
DoubleDefaultNodeCPURateCapacityNanos = 16 * 1e9 // 16 vcpus -const DoubleDefaultStoreDiskCapacityBytes = 2048 << 30 // 2048 GiB +const ( + DefaultNodeCPUCores = 8.0 // 8 vcpus + DefaultNodeCPURateCapacityNanos = DefaultNodeCPUCores * 1e9 // 8 vcpus + DefaultStoreDiskCapacityBytes = 1024 << 30 // 1024 GiB + DoubleDefaultNodeCPURateCapacityNanos = DefaultNodeCPURateCapacityNanos * 2 // 16 vcpus + DoubleDefaultStoreDiskCapacityBytes = 2048 << 30 // 2048 GiB +) var ( // DefaultStartTime is used as the default beginning time for simulation diff --git a/pkg/kv/kvserver/asim/state/BUILD.bazel b/pkg/kv/kvserver/asim/state/BUILD.bazel index 24da0d71aadf..efe80dd93f45 100644 --- a/pkg/kv/kvserver/asim/state/BUILD.bazel +++ b/pkg/kv/kvserver/asim/state/BUILD.bazel @@ -11,6 +11,7 @@ go_library( "load.go", "new_state.go", "new_state_test_helper.go", + "node_cpu_cores.go", "node_cpu_rate_capacities.go", "parser_replica_placement.go", "split_decider.go", @@ -60,6 +61,7 @@ go_test( "change_test.go", "config_loader_test.go", "liveness_test.go", + "node_cpu_cores_test.go", "node_cpu_rate_capacities_test.go", "parser_replica_placement_test.go", "split_decider_test.go", diff --git a/pkg/kv/kvserver/asim/state/node_cpu_cores.go b/pkg/kv/kvserver/asim/state/node_cpu_cores.go new file mode 100644 index 000000000000..1ee5cf7e7453 --- /dev/null +++ b/pkg/kv/kvserver/asim/state/node_cpu_cores.go @@ -0,0 +1,19 @@ +// Copyright 2025 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +package state + +type NodeCPUCores []float64 + +// ToRateCapacityNanos converts NodeCPUCores to node capacities in nanos +// (NodeCPURateCapacities). 
+func (nc NodeCPUCores) ToRateCapacityNanos() NodeCPURateCapacities { + res := make(NodeCPURateCapacities, len(nc)) + const nanosPerSecond = 1e9 + for i, cores := range nc { + res[i] = uint64(cores * nanosPerSecond) + } + return res +} diff --git a/pkg/kv/kvserver/asim/state/node_cpu_cores_test.go b/pkg/kv/kvserver/asim/state/node_cpu_cores_test.go new file mode 100644 index 000000000000..542068ec4d61 --- /dev/null +++ b/pkg/kv/kvserver/asim/state/node_cpu_cores_test.go @@ -0,0 +1,52 @@ +// Copyright 2025 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +package state + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNodeCPUCores_ToRateCapacityNanos(t *testing.T) { + testCases := []struct { + name string + cores NodeCPUCores + expected NodeCPURateCapacities + }{ + { + name: "empty", + cores: NodeCPUCores{}, + expected: NodeCPURateCapacities{}, + }, + { + name: "single_core", + cores: NodeCPUCores{1.0}, + expected: NodeCPURateCapacities{1e9}, + }, + { + name: "multiple_cores", + cores: NodeCPUCores{1.0, 2.0, 3.0}, + expected: NodeCPURateCapacities{1e9, 2e9, 3e9}, + }, + { + name: "fractional_cores", + cores: NodeCPUCores{1.5, 2.75, 3.25}, + expected: NodeCPURateCapacities{1500e6, 2750e6, 3250e6}, + }, + { + name: "round_down_to_nearest_nanosecond", + cores: NodeCPUCores{1.4999999998, 2.1111111111999}, + expected: NodeCPURateCapacities{1499999999, 2111111111}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.expected, tc.cores.ToRateCapacityNanos()) + }) + } +} diff --git a/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go b/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go index 93ad95f3faa9..74f72c962761 100644 --- a/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go +++ b/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go @@ -61,11 +61,11 @@ var runAsimTests = 
envutil.EnvOrDefaultBool("COCKROACH_RUN_ASIM_TESTS", false) // cpu_per_access=0 raft_cpu_per_write=0 // // - "gen_cluster" [nodes=] [stores_per_node=] -// [store_byte_capacity_gib=] [node_cpu_rate_capacity=] +// [store_byte_capacity_gib=] [node_cpu_cores=] // Initialize the cluster generator parameters. On the next call to eval, // the cluster generator is called to create the initial state used in the // simulation. The default values are: nodes=3 stores_per_node=1 -// store_byte_capacity_gib=256, node_cpu_rate_capacity=0. +// store_byte_capacity_gib=256, node_cpu_cores=8.0. // // - "load_cluster": config= // Load a defined cluster configuration to be the generated cluster in the @@ -342,7 +342,7 @@ func TestDataDriven(t *testing.T) { case "gen_cluster": var nodes = 3 var storesPerNode = 1 - var nodeCPURateCapacity = []uint64{config.DefaultNodeCPURateCapacityNanos} + var nodeCPUCores = []float64{config.DefaultNodeCPUCores} var region []string var nodesPerRegion []int var storeByteCapacityGiB int64 = 256 @@ -351,16 +351,16 @@ func TestDataDriven(t *testing.T) { scanIfExists(t, d, "store_byte_capacity_gib", &storeByteCapacityGiB) scanIfExists(t, d, "region", &region) scanIfExists(t, d, "nodes_per_region", &nodesPerRegion) - scanIfExists(t, d, "node_cpu_rate_capacity", &nodeCPURateCapacity) + scanIfExists(t, d, "node_cpu_cores", &nodeCPUCores) var buf strings.Builder - require.NotEmpty(t, nodeCPURateCapacity) + require.NotEmpty(t, nodeCPUCores) { - n := len(nodeCPURateCapacity) - require.True(t, n == 1 || n == nodes, "need to specify node_cpu_rate_capacity for each node") + n := len(nodeCPUCores) + require.True(t, n == 1 || n == nodes, "need to specify node_cpu_cores for each node") - for _, cpct := range nodeCPURateCapacity { - if cores := float64(cpct) / 1e9; cores < 1 { + for _, cores := range nodeCPUCores { + if cores < 1.0 { // TODO(mma): fix up the tests that trigger this warning.
// TODO(mma): print a warning whenever the measured CPU utilization // on a node exceeds this capacity, as that's likely not what the test @@ -377,7 +377,7 @@ func TestDataDriven(t *testing.T) { StoreByteCapacity: storeByteCapacityGiB << 30, Region: region, NodesPerRegion: nodesPerRegion, - NodeCPURateCapacity: nodeCPURateCapacity, + NodeCPURateCapacity: state.NodeCPUCores(nodeCPUCores).ToRateCapacityNanos(), } return buf.String() case "load_cluster": diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/heterogeneous_cpu.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/heterogeneous_cpu.txt index a4a0b66f6347..13a8a638bc8a 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/heterogeneous_cpu.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/heterogeneous_cpu.txt @@ -9,7 +9,7 @@ # balance on absolute cpu-nanos. n3 should handle more load due to its higher # capacity, but the current implementation doesn't account for this. This is # tracked in issue: https://github.com/cockroachdb/cockroach/issues/153777. -gen_cluster nodes=3 node_cpu_rate_capacity=(8000000000,8000000000,16000000000) +gen_cluster nodes=3 node_cpu_cores=(8,8,16) ---- gen_ranges ranges=200 min_key=1 max_key=10000 placement_type=even diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu.txt index 7edc0623d878..bb16931d7cb4 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu.txt @@ -7,7 +7,7 @@ # # Expected outcome: The allocator should rebalance both replicas and leases to # distribute the high-cpu workload more evenly across all 10 nodes. 
-gen_cluster nodes=10 node_cpu_rate_capacity=8000000000 +gen_cluster nodes=10 node_cpu_cores=8 ---- # TODO(wenyihu6): why didn't we balance more replicas/leases - is it because of a very high cpu per range diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_25nodes.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_25nodes.txt index 6b197ef730a0..d9798387ed43 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_25nodes.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_25nodes.txt @@ -2,7 +2,7 @@ # there is high CPU load imbalance across a large cluster. The test set-up is # similar to high_cpu.txt but is on 25 nodes and with 3x the load for two # gen_load commands. -gen_cluster nodes=25 node_cpu_rate_capacity=8000000000 +gen_cluster nodes=25 node_cpu_cores=8 ---- setting split_queue_enabled=false diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_able_to_shed_leases.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_able_to_shed_leases.txt index 61d828461849..1f75bae60625 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_able_to_shed_leases.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_able_to_shed_leases.txt @@ -4,7 +4,7 @@ # will be able to shed its own leases because it is the leaseholer. There should # be a period of lease-rebalancing activity before replica-rebalancing. 
-gen_cluster nodes=5 node_cpu_rate_capacity=9000000000 +gen_cluster nodes=5 node_cpu_cores=9 ---- setting split_queue_enabled=false diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_unable_to_shed_leases.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_unable_to_shed_leases.txt index 1a186ac39a63..be4fd5d2e999 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_unable_to_shed_leases.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_cpu_unable_to_shed_leases.txt @@ -16,7 +16,7 @@ # the CPU overloaded s1, so we should observe a period of lease transfers before # any replica based rebalancing away from the store occurs. -gen_cluster nodes=5 node_cpu_rate_capacity=9000000000 +gen_cluster nodes=5 node_cpu_cores=9 ---- setting split_queue_enabled=false diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_write_uniform_cpu.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_write_uniform_cpu.txt index 15ee2c6dadb3..8d696f3fd2be 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_write_uniform_cpu.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/high_write_uniform_cpu.txt @@ -4,7 +4,7 @@ # # Expected outcome: mma should rebalance replicas and leases to distribute the # cpu load and write load more evenly across all stores. 
-gen_cluster nodes=10 node_cpu_rate_capacity=3000000000 stores_per_node=2 +gen_cluster nodes=10 node_cpu_cores=3 stores_per_node=2 ---- # Read only workload, which generates 1000 request cpu nanos/s evenly over diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/one_voter_skewed_cpu_skewed_write.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/one_voter_skewed_cpu_skewed_write.txt index 5c4c2c66d92e..4ccb1b86650c 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/one_voter_skewed_cpu_skewed_write.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/one_voter_skewed_cpu_skewed_write.txt @@ -5,7 +5,7 @@ # # Expected outcome: two stores should roughly equalize their cpu load and write # load via range rebalancing. -gen_cluster nodes=2 node_cpu_rate_capacity=1000000000 +gen_cluster nodes=2 node_cpu_cores=1 ---- gen_ranges ranges=100 repl_factor=1 min_key=1 max_key=10000 placement_type=replica_placement bytes_mib=26 diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_even_ranges_mma.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_even_ranges_mma.txt index 1ededabaf81d..e5f5fdf9a672 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_even_ranges_mma.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_even_ranges_mma.txt @@ -6,7 +6,7 @@ # # Expected outcome: The allocator should rebalance both leases and replicas to # achieve more even cpu and write distribution across all nodes. -gen_cluster nodes=9 node_cpu_rate_capacity=5000000000 +gen_cluster nodes=9 node_cpu_cores=5 ---- # The placement will be skewed, s.t. 
n1/s1, n2/s2 and n3/s3 will have all the diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write.txt index ceb6608f78c9..a9c28b8ee76d 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write.txt @@ -12,7 +12,7 @@ # ignoreLevel logic in rebalanceStores with the grace duration to start # shedding more aggressively and other related changes have made this much # better. -gen_cluster nodes=6 node_cpu_rate_capacity=5000000000 +gen_cluster nodes=6 node_cpu_cores=5 ---- # The placement will be skewed, s.t. n1/s1, n2/s2 and n3/s3 will have all the diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write_more_ranges.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write_more_ranges.txt index b5f2243779f5..891fbf6576c4 100644 --- a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write_more_ranges.txt +++ b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/skewed_cpu_skewed_write_more_ranges.txt @@ -3,7 +3,7 @@ # # Expected outcome: The allocator should rebalance both cpu and write load across # all stores, with mma achieving better results than sma. -gen_cluster nodes=6 node_cpu_rate_capacity=5000000000 +gen_cluster nodes=6 node_cpu_cores=5 ---- # The placement will be skewed, s.t. n1/s1, n2/s2 and n3/s3 will have all the