Skip to content

Commit a4e63be

Browse files
craig[bot]wenyihu6
andcommitted
Merge #153757
153757: asim: record cpu utilization in StoreMetrics r=tbg a=wenyihu6 Resolves: #153742 Release note: none --- **asim: plumb NodeCPURateCapacity everywhere** Previously, node CPU rate capacity wasn’t consistently plumbed, and some tests treated it as optional. This commit ensures it is passed through everywhere, preventing misconfigurations from being silently missed. --- **asim: record cpu utilization in StoreMetrics** Previously, StoreMetrics didn't include CPU utilization, making it hard to evaluate the simulation setup. This commit adds CPU utilization to the store metrics. Note that due to the lack of node-level metrics, the same CPU utilization will be repeated for stores on the same node. --- **asim: move SetNodeLocality & s.SetNodeCPURateCapacity to AddNode** Previously, s.SetNodeLocality and s.SetNodeCPURateCapacity were called after s.AddNode, making it easy to miss populating these fields. This commit updates s.AddNode to take CPU rate capacity and node locality explicitly as arguments. Co-authored-by: wenyihu6 <[email protected]>
2 parents cd189e8 + 3f8df3e commit a4e63be

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+181
-136
lines changed

pkg/kv/kvserver/asim/config/settings.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ const (
3232
defaultLBRebalancingObjective = 0 // QPS
3333
)
3434

35+
const DefaultNodeCPURateCapacityNanos = 8 * 1e9 // 8 vcpus
36+
const DefaultStoreDiskCapacityBytes = 1024 << 30 // 1024 GiB
37+
const DoubleDefaultNodeCPURateCapacityNanos = 16 * 1e9 // 16 vcpus
38+
const DoubleDefaultStoreDiskCapacityBytes = 2048 << 30 // 2048 GiB
39+
3540
var (
3641
// DefaultStartTime is used as the default beginning time for simulation
3742
// runs. It isn't necessarily meaningful other than for logging and having

pkg/kv/kvserver/asim/event/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ go_library(
1111
visibility = ["//visibility:public"],
1212
deps = [
1313
"//pkg/kv/kvserver/asim/assertion",
14+
"//pkg/kv/kvserver/asim/config",
1415
"//pkg/kv/kvserver/asim/history",
1516
"//pkg/kv/kvserver/asim/state",
1617
"//pkg/kv/kvserver/liveness/livenesspb",

pkg/kv/kvserver/asim/event/mutation_event.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"fmt"
1111
"strings"
1212

13+
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/config"
1314
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/state"
1415
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
1516
"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -123,14 +124,14 @@ func (se SetSpanConfigEvent) String() string {
123124

124125
func (ae AddNodeEvent) Func() EventFunc {
125126
return MutationFunc(func(ctx context.Context, s state.State) {
126-
node := s.AddNode()
127+
// TODO(wenyihu6): should we change AddNode to take in
128+
var locality roachpb.Locality
127129
if ae.LocalityString != "" {
128-
var locality roachpb.Locality
129130
if err := locality.Set(ae.LocalityString); err != nil {
130131
panic(fmt.Sprintf("unable to set node locality %s", err.Error()))
131132
}
132-
s.SetNodeLocality(node.NodeID(), locality)
133133
}
134+
node := s.AddNode(config.DefaultNodeCPURateCapacityNanos, locality)
134135
for i := 0; i < ae.NumStores; i++ {
135136
if _, ok := s.AddStore(node.NodeID()); !ok {
136137
panic(fmt.Sprintf("adding store to node=%d failed", node))

pkg/kv/kvserver/asim/history/history.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ func (h *History) ShowRecordedValueAt(idx int, stat string) (string, bool) {
6565
}
6666
storeID := h.Recorded[idx][i].StoreID
6767

68-
if stat == "disk_fraction_used" {
68+
if stat == "disk_fraction_used" || stat == "cpu_util" {
6969
_, _ = fmt.Fprintf(&buf, "s%v=%.2f", storeID, v)
7070
} else {
7171
_, _ = fmt.Fprintf(&buf, "s%v=%.0f", storeID, v)

pkg/kv/kvserver/asim/metrics/series.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ func MakeTS(metrics [][]StoreMetrics) map[string][][]float64 {
2222
// are partially duplicated with the cluster tracker.
2323
ret["qps"] = make([][]float64, stores)
2424
ret["cpu"] = make([][]float64, stores)
25+
ret["cpu_util"] = make([][]float64, stores)
2526
ret["write"] = make([][]float64, stores)
2627
ret["write_b"] = make([][]float64, stores)
2728
ret["write_bytes_per_second"] = make([][]float64, stores)
@@ -40,6 +41,7 @@ func MakeTS(metrics [][]StoreMetrics) map[string][][]float64 {
4041
for i, sm := range sms {
4142
ret["qps"][i] = append(ret["qps"][i], float64(sm.QPS))
4243
ret["cpu"][i] = append(ret["cpu"][i], float64(sm.CPU))
44+
ret["cpu_util"][i] = append(ret["cpu_util"][i], sm.NodeCPUUtilization)
4345
ret["write"][i] = append(ret["write"][i], float64(sm.WriteKeys))
4446
ret["write_b"][i] = append(ret["write_b"][i], float64(sm.WriteBytes))
4547
ret["write_bytes_per_second"][i] = append(ret["write_bytes_per_second"][i], float64(sm.WriteBytesPerSecond))

pkg/kv/kvserver/asim/metrics/tracker.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ type StoreMetrics struct {
2222
StoreID int64
2323
QPS int64
2424
CPU int64
25+
NodeCPUUtilization float64
2526
WriteKeys int64
2627
WriteBytes int64
2728
WriteBytesPerSecond int64
@@ -75,6 +76,8 @@ func (sm *StoreMetrics) GetMetricValue(stat string) float64 {
7576
return float64(sm.RangeSplits)
7677
case "disk_fraction_used":
7778
return sm.DiskFractionUsed
79+
case "cpu_util":
80+
return sm.NodeCPUUtilization
7881
default:
7982
return 0
8083
}
@@ -139,12 +142,21 @@ func (mt *Tracker) Tick(ctx context.Context, tick time.Time, s state.State) {
139142
}
140143

141144
desc := store.Descriptor()
145+
nodeCapacity := s.NodeCapacity(store.NodeID())
146+
147+
// NodeCPURateUsage is the same as StoresCPURate in asim.
148+
if nodeCapacity.NodeCPURateCapacity == 0 {
149+
panic(fmt.Sprintf("unexpected: node cpu rate capacity is 0 (node cpu rate usage = %d)",
150+
nodeCapacity.NodeCPURateUsage))
151+
}
152+
cpuUtil := float64(nodeCapacity.NodeCPURateUsage) / float64(nodeCapacity.NodeCPURateCapacity)
142153

143154
sm := StoreMetrics{
144155
Tick: tick,
145156
StoreID: int64(storeID),
146157
QPS: int64(desc.Capacity.QueriesPerSecond),
147158
CPU: int64(desc.Capacity.CPUPerSecond),
159+
NodeCPUUtilization: cpuUtil,
148160
WriteKeys: u.WriteKeys,
149161
WriteBytes: u.WriteBytes,
150162
WriteBytesPerSecond: int64(desc.Capacity.WriteBytesPerSecond),

pkg/kv/kvserver/asim/state/config_loader.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ var AllClusterOptions = [...]string{"single_region", "single_region_multi_store"
2626
// SingleRegionConfig is a simple cluster config with a single region and 3
2727
// zones, all have the same number of nodes.
2828
var SingleRegionConfig = ClusterInfo{
29-
StoreDiskCapacityBytes: 1024 << 30, // 1024 GiB
29+
NodeCPURateCapacityNanos: config.DefaultNodeCPURateCapacityNanos, // 8 vcpus
30+
StoreDiskCapacityBytes: config.DefaultStoreDiskCapacityBytes, // 1024 GiB
3031
Regions: []Region{
3132
{
3233
Name: "US",
@@ -42,7 +43,8 @@ var SingleRegionConfig = ClusterInfo{
4243
// SingleRegionMultiStoreConfig is a simple cluster config with a single region
4344
// and 3 zones, all zones have 1 node and 5 stores per node.
4445
var SingleRegionMultiStoreConfig = ClusterInfo{
45-
StoreDiskCapacityBytes: 1024 << 30, // 1024 GiB
46+
NodeCPURateCapacityNanos: config.DefaultNodeCPURateCapacityNanos, // 8 vcpus
47+
StoreDiskCapacityBytes: config.DefaultStoreDiskCapacityBytes, // 1024 GiB
4648
Regions: []Region{
4749
{
4850
Name: "US",
@@ -57,7 +59,8 @@ var SingleRegionMultiStoreConfig = ClusterInfo{
5759

5860
// MultiRegionConfig is a perfectly balanced cluster config with 3 regions.
5961
var MultiRegionConfig = ClusterInfo{
60-
StoreDiskCapacityBytes: 2048 << 30, // 2048 GiB
62+
NodeCPURateCapacityNanos: config.DoubleDefaultNodeCPURateCapacityNanos, // 16 vcpus
63+
StoreDiskCapacityBytes: config.DoubleDefaultStoreDiskCapacityBytes, // 2048 GiB
6164
Regions: []Region{
6265
{
6366
Name: "US_East",
@@ -88,7 +91,8 @@ var MultiRegionConfig = ClusterInfo{
8891

8992
// ComplexConfig is an imbalanced multi-region cluster config.
9093
var ComplexConfig = ClusterInfo{
91-
StoreDiskCapacityBytes: 2048 << 30, // 2048 GiB
94+
NodeCPURateCapacityNanos: config.DoubleDefaultNodeCPURateCapacityNanos, // 16 vcpus
95+
StoreDiskCapacityBytes: config.DoubleDefaultStoreDiskCapacityBytes, // 2048 GiB
9296
Regions: []Region{
9397
{
9498
Name: "US_East",
@@ -371,9 +375,7 @@ func LoadClusterInfo(c ClusterInfo, settings *config.SimulationSettings) State {
371375
Tiers: []roachpb.Tier{regionTier, zoneTier},
372376
}
373377
for i := 0; i < z.NodeCount; i++ {
374-
node := s.AddNode()
375-
s.SetNodeLocality(node.NodeID(), locality)
376-
s.SetNodeCPURateCapacity(node.NodeID(), c.NodeCPURateCapacityNanos)
378+
node := s.AddNode(c.NodeCPURateCapacityNanos, locality)
377379
storesRequired := z.StoresPerNode
378380
if storesRequired < 1 {
379381
panic(fmt.Sprintf("storesPerNode cannot be less than one but found %v", storesRequired))

pkg/kv/kvserver/asim/state/config_loader_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,11 @@ func TestLoadRangesInfo(t *testing.T) {
158158
t.Run(tc.desc, func(t *testing.T) {
159159
settings := config.DefaultSimulationSettings()
160160
state := NewState(settings)
161-
_, ok := state.AddStore(state.AddNode().NodeID())
161+
_, ok := state.AddStore(state.AddNode(-1, roachpb.Locality{}).NodeID())
162162
require.True(t, ok)
163-
_, ok = state.AddStore(state.AddNode().NodeID())
163+
_, ok = state.AddStore(state.AddNode(-1, roachpb.Locality{}).NodeID())
164164
require.True(t, ok)
165-
_, ok = state.AddStore(state.AddNode().NodeID())
165+
_, ok = state.AddStore(state.AddNode(-1, roachpb.Locality{}).NodeID())
166166
require.True(t, ok)
167167

168168
if tc.expectPanic {

pkg/kv/kvserver/asim/state/impl.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -419,9 +419,9 @@ func (s *state) Replicas(storeID StoreID) []Replica {
419419
return repls
420420
}
421421

422-
// AddNode modifies the state to include one additional node. This cannot
423-
// fail. The new Node is returned.
424-
func (s *state) AddNode() Node {
422+
// AddNode modifies the state to include one additional node. This cannot fail.
423+
// The new Node is returned.
424+
func (s *state) AddNode(nodeCPUCapacity int64, locality roachpb.Locality) Node {
425425
s.nodeSeqGen++
426426
nodeID := s.nodeSeqGen
427427
mmAllocator := mmaprototype.NewAllocatorState(s.clock, rand.New(rand.NewSource(s.settings.Seed)))
@@ -437,6 +437,8 @@ func (s *state) AddNode() Node {
437437
}
438438
s.nodes[nodeID] = node
439439
s.SetNodeLiveness(nodeID, livenesspb.NodeLivenessStatus_LIVE)
440+
s.SetNodeLocality(nodeID, locality)
441+
s.SetNodeCPURateCapacity(nodeID, nodeCPUCapacity)
440442
return node
441443
}
442444
func (s *state) SetNodeLocality(nodeID NodeID, locality roachpb.Locality) {

pkg/kv/kvserver/asim/state/new_state_test_helper.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ func NewStateWithDistribution(
2727
// Currently multi-store is not tested for correctness. Default to a single
2828
// store per node.
2929
clusterInfo := ClusterInfoWithStoreCount(numNodes, 1 /* storesPerNode */)
30+
clusterInfo.NodeCPURateCapacityNanos = config.DefaultNodeCPURateCapacityNanos
3031
s := LoadClusterInfo(clusterInfo, settings)
3132

3233
stores := make([]StoreID, numNodes)

0 commit comments

Comments
 (0)