Skip to content

Commit 96e7205

Browse files
committed
load: fail proactively in NodeCapacityProvider.Run when CPU usage can't be read
CPU usage can fail to be read on rare, unusual systems. In this situation, we should fail proactively before starting up the asynchronous node capacity provider, since it relies on being able to read CPU utilization; failing early at startup is easier for orchestrators to handle. This is achieved by introducing a call to `n.runtimeLoadMonitor.recordCPUUsage` prior to starting the asynchronous monitor. Epic: CRDB-55052 Fixes #156846 Release note: none
1 parent 33ebee8 commit 96e7205

File tree

1 file changed

+19
-10
lines changed

1 file changed

+19
-10
lines changed

pkg/kv/kvserver/load/node_capacity_provider.go

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,14 @@ func NewNodeCapacityProvider(
7373

7474
// Run starts the background monitoring of cpu metrics.
7575
func (n *NodeCapacityProvider) Run(ctx context.Context) {
76+
// Record CPU usage and capacity prior to starting the async job to verify
77+
// that we're able to read CPU utilization metrics at all.
78+
err := n.runtimeLoadMonitor.recordCPUUsage(ctx)
79+
if err != nil {
80+
log.KvDistribution.Fatalf(ctx, "failed to record cpu usage: %v", err)
81+
return
82+
}
83+
7684
_ = n.runtimeLoadMonitor.stopper.RunAsyncTask(ctx, "runtime-load-monitor", func(ctx context.Context) {
7785
n.runtimeLoadMonitor.run(ctx)
7886
})
@@ -137,25 +145,23 @@ func (m *runtimeLoadMonitor) GetCPUStats() (cpuUsageNanoPerSec int64, cpuCapacit
137145
}
138146

139147
// recordCPUUsage samples and records the current cpu usage of the node.
140-
func (m *runtimeLoadMonitor) recordCPUUsage(ctx context.Context) {
148+
func (m *runtimeLoadMonitor) recordCPUUsage(ctx context.Context) error {
141149
m.mu.Lock()
142150
defer m.mu.Unlock()
143151
userTimeMillis, sysTimeMillis, err := status.GetProcCPUTime(ctx)
144152
if err != nil {
145-
if buildutil.CrdbTestBuild {
146-
panic(err)
147-
}
148-
// TODO(wenyihu6): we should revisit error handling here for production.
149-
log.KvDistribution.Warningf(ctx, "failed to get cpu usage: %v", err)
153+
return errors.NewAssertionErrorWithWrappedErrf(err, "failed to get cpu usage")
150154
}
151155
// Convert milliseconds to nanoseconds.
152156
totalUsageNanos := float64(userTimeMillis*1e6 + sysTimeMillis*1e6)
153-
if buildutil.CrdbTestBuild && m.mu.lastTotalUsageNanos > totalUsageNanos {
154-
panic(errors.Newf("programming error: last cpu usage is larger than current: %v > %v",
155-
m.mu.lastTotalUsageNanos, totalUsageNanos))
157+
if totalUsageNanos < m.mu.lastTotalUsageNanos {
158+
log.KvDistribution.Warningf(ctx, "last cpu usage is larger than current: %v > %v",
159+
m.mu.lastTotalUsageNanos, totalUsageNanos)
160+
totalUsageNanos = m.mu.lastTotalUsageNanos
156161
}
157162
m.mu.usageEWMA.Add(totalUsageNanos - m.mu.lastTotalUsageNanos)
158163
m.mu.lastTotalUsageNanos = totalUsageNanos
164+
return nil
159165
}
160166

161167
// recordCPUCapacity samples and records the current cpu capacity of the node.
@@ -189,7 +195,10 @@ func (m *runtimeLoadMonitor) run(ctx context.Context) {
189195
return
190196
case <-usageTimer.C:
191197
usageTimer.Reset(m.usageRefreshInterval)
192-
m.recordCPUUsage(ctx)
198+
err := m.recordCPUUsage(ctx)
199+
if err != nil {
200+
log.KvDistribution.Warningf(ctx, "failed to record cpu usage: %v", err)
201+
}
193202
case <-capacityTimer.C:
194203
capacityTimer.Reset(m.capacityRefreshInterval)
195204
m.recordCPUCapacity(ctx)

0 commit comments

Comments
 (0)