From c521d9261f8080ec9d325a9f81decf181d1c90df Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 7 Jan 2026 11:09:44 +0100 Subject: [PATCH 1/3] status: expose Go heap object count metrics Add two new metrics sourced from Go's runtime/metrics package: - sys.go.heap.objects: number of objects (live + unswept) on the heap, from /gc/heap/objects:objects - sys.go.heap.livebytes: bytes of live heap objects marked by the previous GC, from /gc/heap/live:bytes These complement the existing heap byte metrics and provide visibility into object allocation patterns, which can be useful for diagnosing GC pressure from high object churn. Release note: None --- pkg/server/status/runtime.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go index ac08b412a10b..ce6b00a9711a 100644 --- a/pkg/server/status/runtime.go +++ b/pkg/server/status/runtime.go @@ -109,6 +109,18 @@ var ( Unit: metric.Unit_BYTES, Visibility: metric.Metadata_SUPPORT, } + metaGoHeapObjects = metric.Metadata{ + Name: "sys.go.heap.objects", + Help: "Number of live objects on the heap (live + unswept)", + Measurement: "Objects", + Unit: metric.Unit_COUNT, + } + metaGoHeapLiveBytes = metric.Metadata{ + Name: "sys.go.heap.livebytes", + Help: "Bytes of live heap objects marked by the previous GC", + Measurement: "Memory", + Unit: metric.Unit_BYTES, + } metaCgoAllocBytes = metric.Metadata{ Name: "sys.cgo.allocbytes", Help: "Current bytes of memory allocated by cgo", @@ -617,6 +629,13 @@ const runtimeMetricGoLimit = "/gc/gomemlimit:bytes" // Count of all completed GC cycles. const runtimeMetricGCCount = "/gc/cycles/total:gc-cycles" +// Number of objects, live and unswept, occupying heap memory. +const runtimeMetricHeapObjects = "/gc/heap/objects:objects" + +// Heap memory occupied by live objects that were marked by the +// previous GC. +const runtimeMetricHeapLiveBytes = "/gc/heap/live:bytes" + var runtimeMetrics = []string{ runtimeMetricGCAssist, runtimeMetricGoTotal, @@ -629,6 +648,8 @@ var runtimeMetrics = []string{ runtimeMetricMemStackOSBytes, runtimeMetricCumulativeAlloc, runtimeMetricGCCount, + runtimeMetricHeapObjects, + runtimeMetricHeapLiveBytes, runtimeMetricGCPauseTotal, runtimeMetricNonGCPauseTotal, runtimeMetricGCStopTotal, @@ -804,6 +825,8 @@ type RuntimeStatSampler struct { GoHeapReservedBytes *metric.Gauge GoHeapReleasedBytes *metric.Gauge GoTotalAllocBytes *metric.Counter + GoHeapObjects *metric.Gauge + GoHeapLiveBytes *metric.Gauge CgoAllocBytes *metric.Gauge CgoTotalBytes *metric.Gauge GcCount *metric.Counter @@ -906,6 +929,8 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta GoHeapReservedBytes: metric.NewGauge(metaGoHeapReservedBytes), GoHeapReleasedBytes: metric.NewGauge(metaGoHeapReleasedBytes), GoTotalAllocBytes: metric.NewCounter(metaGoTotalAllocBytes), + GoHeapObjects: metric.NewGauge(metaGoHeapObjects), + GoHeapLiveBytes: metric.NewGauge(metaGoHeapLiveBytes), CgoAllocBytes: metric.NewGauge(metaCgoAllocBytes), CgoTotalBytes: metric.NewGauge(metaCgoTotalBytes), GcCount: metric.NewCounter(metaGCCount), @@ -1206,6 +1231,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context, cs *CGoMem rsr.GoHeapReservedBytes.Update(int64(heapReservedBytes)) rsr.GoHeapReleasedBytes.Update(int64(heapReleasedBytes)) rsr.GoTotalAllocBytes.Update(int64(rsr.goRuntimeSampler.uint64(runtimeMetricCumulativeAlloc))) + rsr.GoHeapObjects.Update(int64(rsr.goRuntimeSampler.uint64(runtimeMetricHeapObjects))) + rsr.GoHeapLiveBytes.Update(int64(rsr.goRuntimeSampler.uint64(runtimeMetricHeapLiveBytes))) rsr.CgoCalls.Update(numCgoCall) rsr.Goroutines.Update(int64(numGoroutine)) rsr.RunnableGoroutinesPerCPU.Update(runnableAvg) From 7c2f86f221bc84508d4dcf26bd39961dbddd0971 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 7 Jan 2026 11:12:53 +0100 Subject: [PATCH 2/3] status: expose total GC CPU time metric Add sys.gc.total.ns metric sourced from Go's /cpu/classes/gc/total:cpu-seconds runtime metric. This provides visibility into total CPU time spent on GC tasks, complementing the existing sys.gc.assist.ns which only captures mutator assist time. Release note: None --- pkg/server/status/runtime.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go index ce6b00a9711a..a4c805f8552f 100644 --- a/pkg/server/status/runtime.go +++ b/pkg/server/status/runtime.go @@ -168,6 +168,12 @@ var ( Measurement: "CPU Time", Unit: metric.Unit_NANOSECONDS, } + metaGCTotalNS = metric.Metadata{ + Name: "sys.gc.total.ns", + Help: "Estimated total CPU time spent performing GC tasks", + Measurement: "CPU Time", + Unit: metric.Unit_NANOSECONDS, + } metaNonGCPauseNS = metric.Metadata{ Name: "sys.go.pause.other.ns", Help: "Estimated non-GC-related total pause time", @@ -561,6 +567,9 @@ const runtimeMetricGCStopTotal = "/sched/pauses/stopping/gc:seconds" // Compare only with other /cpu/classes metrics. const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds" +// Estimated total CPU time spent performing GC tasks. +const runtimeMetricGCTotal = "/cpu/classes/gc/total:cpu-seconds" + // Distribution of individual non-GC-related stop-the-world // pause latencies. This is the time from deciding to stop the // world until the world is started again. Some of this time @@ -638,6 +647,7 @@ const runtimeMetricHeapLiveBytes = "/gc/heap/live:bytes" var runtimeMetrics = []string{ runtimeMetricGCAssist, + runtimeMetricGCTotal, runtimeMetricGoTotal, runtimeMetricHeapAlloc, runtimeMetricGoLimit, @@ -836,6 +846,7 @@ type RuntimeStatSampler struct { NonGcStopNS *metric.Gauge GcPausePercent *metric.GaugeFloat64 GcAssistNS *metric.Counter + GcTotalNS *metric.Counter // CPU stats for the CRDB process usage. CPUUserNS *metric.Counter CPUUserPercent *metric.GaugeFloat64 @@ -938,6 +949,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta GcStopNS: metric.NewGauge(metaGCStopNS), GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent), GcAssistNS: metric.NewCounter(metaGCAssistNS), + GcTotalNS: metric.NewCounter(metaGCTotalNS), NonGcPauseNS: metric.NewGauge(metaNonGCPauseNS), NonGcStopNS: metric.NewGauge(metaNonGCStopNS), @@ -1243,6 +1255,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context, cs *CGoMem rsr.GcStopNS.Update(gcStopTotalNs) rsr.GcPausePercent.Update(gcPauseRatio) rsr.GcAssistNS.Update(gcAssistNS) + rsr.GcTotalNS.Update(int64(rsr.goRuntimeSampler.float64(runtimeMetricGCTotal) * 1e9)) rsr.NonGcPauseNS.Update(nonGcPauseTotalNs) rsr.NonGcStopNS.Update(nonGcStopTotalNs) From e98da3d1826019ea6e7997c53751505554ebbc80 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Thu, 8 Jan 2026 17:35:19 +0100 Subject: [PATCH 3/3] metrics: regenerate --- docs/generated/metrics/metrics.yaml | 24 +++++++++++++++++++ .../files/cockroachdb_metrics.yaml | 3 +++ 2 files changed, 27 insertions(+) diff --git a/docs/generated/metrics/metrics.yaml b/docs/generated/metrics/metrics.yaml index 4466a04cde36..c9203c0d33c3 100644 --- a/docs/generated/metrics/metrics.yaml +++ b/docs/generated/metrics/metrics.yaml @@ -10045,6 +10045,14 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NONE + - name: sys.gc.total.ns + exported_name: sys_gc_total_ns + description: Estimated total CPU time spent performing GC tasks + y_axis_label: CPU Time + type: COUNTER + unit: NANOSECONDS + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sys.go.allocbytes exported_name: sys_go_allocbytes description: Current bytes of memory allocated by go @@ -10088,6 +10096,22 @@ layers: unit: BYTES aggregation: AVG derivative: NONE + - name: sys.go.heap.livebytes + exported_name: sys_go_heap_livebytes + description: Bytes of live heap objects marked by the previous GC + y_axis_label: Memory + type: GAUGE + unit: BYTES + aggregation: AVG + derivative: NONE + - name: sys.go.heap.objects + exported_name: sys_go_heap_objects + description: Number of live objects on the heap (live + unswept) + y_axis_label: Objects + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE - name: sys.go.limitbytes exported_name: sys_go_limitbytes description: Go soft memory limit diff --git a/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml b/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml index 3e1f8f90228b..85deb86b6bc4 100644 --- a/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml +++ b/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml @@ -2541,11 +2541,14 @@ sys_gc_count: sys.gc.count sys_gc_pause_ns: sys.gc.pause.ns sys_gc_pause_percent: sys.gc.pause.percent sys_gc_stop_ns: sys.gc.stop.ns +sys_gc_total_ns: sys.gc.total.ns sys_go_allocbytes: sys.go.allocbytes sys_go_heap_allocbytes: sys.go.heap.allocbytes sys_go_heap_heapfragmentbytes: sys.go.heap.heapfragmentbytes sys_go_heap_heapreleasedbytes: sys.go.heap.heapreleasedbytes sys_go_heap_heapreservedbytes: sys.go.heap.heapreservedbytes +sys_go_heap_livebytes: sys.go.heap.livebytes +sys_go_heap_objects: sys.go.heap.objects sys_go_limitbytes: sys.go.limitbytes sys_go_pause_other_ns: sys.go.pause.other.ns sys_go_stack_systembytes: sys.go.stack.systembytes