diff --git a/docs/generated/metrics/metrics.yaml b/docs/generated/metrics/metrics.yaml index 4466a04cde36..c9203c0d33c3 100644 --- a/docs/generated/metrics/metrics.yaml +++ b/docs/generated/metrics/metrics.yaml @@ -10045,6 +10045,14 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NONE + - name: sys.gc.total.ns + exported_name: sys_gc_total_ns + description: Estimated total CPU time spent performing GC tasks + y_axis_label: CPU Time + type: COUNTER + unit: NANOSECONDS + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sys.go.allocbytes exported_name: sys_go_allocbytes description: Current bytes of memory allocated by go @@ -10088,6 +10096,22 @@ layers: unit: BYTES aggregation: AVG derivative: NONE + - name: sys.go.heap.livebytes + exported_name: sys_go_heap_livebytes + description: Bytes of live heap objects marked by the previous GC + y_axis_label: Memory + type: GAUGE + unit: BYTES + aggregation: AVG + derivative: NONE + - name: sys.go.heap.objects + exported_name: sys_go_heap_objects + description: Number of live objects on the heap (live + unswept) + y_axis_label: Objects + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE - name: sys.go.limitbytes exported_name: sys_go_limitbytes description: Go soft memory limit diff --git a/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml b/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml index 3e1f8f90228b..85deb86b6bc4 100644 --- a/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml +++ b/pkg/roachprod/agents/opentelemetry/files/cockroachdb_metrics.yaml @@ -2541,11 +2541,14 @@ sys_gc_count: sys.gc.count sys_gc_pause_ns: sys.gc.pause.ns sys_gc_pause_percent: sys.gc.pause.percent sys_gc_stop_ns: sys.gc.stop.ns +sys_gc_total_ns: sys.gc.total.ns sys_go_allocbytes: sys.go.allocbytes sys_go_heap_allocbytes: sys.go.heap.allocbytes sys_go_heap_heapfragmentbytes: sys.go.heap.heapfragmentbytes sys_go_heap_heapreleasedbytes: sys.go.heap.heapreleasedbytes sys_go_heap_heapreservedbytes: sys.go.heap.heapreservedbytes +sys_go_heap_livebytes: sys.go.heap.livebytes +sys_go_heap_objects: sys.go.heap.objects sys_go_limitbytes: sys.go.limitbytes sys_go_pause_other_ns: sys.go.pause.other.ns sys_go_stack_systembytes: sys.go.stack.systembytes diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go index ac08b412a10b..a4c805f8552f 100644 --- a/pkg/server/status/runtime.go +++ b/pkg/server/status/runtime.go @@ -109,6 +109,18 @@ var ( Unit: metric.Unit_BYTES, Visibility: metric.Metadata_SUPPORT, } + metaGoHeapObjects = metric.Metadata{ + Name: "sys.go.heap.objects", + Help: "Number of live objects on the heap (live + unswept)", + Measurement: "Objects", + Unit: metric.Unit_COUNT, + } + metaGoHeapLiveBytes = metric.Metadata{ + Name: "sys.go.heap.livebytes", + Help: "Bytes of live heap objects marked by the previous GC", + Measurement: "Memory", + Unit: metric.Unit_BYTES, + } metaCgoAllocBytes = metric.Metadata{ Name: "sys.cgo.allocbytes", Help: "Current bytes of memory allocated by cgo", @@ -156,6 +168,12 @@ var ( Measurement: "CPU Time", Unit: metric.Unit_NANOSECONDS, } + metaGCTotalNS = metric.Metadata{ + Name: "sys.gc.total.ns", + Help: "Estimated total CPU time spent performing GC tasks", + Measurement: "CPU Time", + Unit: metric.Unit_NANOSECONDS, + } metaNonGCPauseNS = metric.Metadata{ Name: "sys.go.pause.other.ns", Help: "Estimated non-GC-related total pause time", @@ -549,6 +567,9 @@ const runtimeMetricGCStopTotal = "/sched/pauses/stopping/gc:seconds" // Compare only with other /cpu/classes metrics. const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds" +// Estimated total CPU time spent performing GC tasks. +const runtimeMetricGCTotal = "/cpu/classes/gc/total:cpu-seconds" + // Distribution of individual non-GC-related stop-the-world // pause latencies. This is the time from deciding to stop the // world until the world is started again. Some of this time @@ -617,8 +638,16 @@ const runtimeMetricGoLimit = "/gc/gomemlimit:bytes" // Count of all completed GC cycles. const runtimeMetricGCCount = "/gc/cycles/total:gc-cycles" +// Number of objects, live and unswept, occupying heap memory. +const runtimeMetricHeapObjects = "/gc/heap/objects:objects" + +// Heap memory occupied by live objects that were marked by the +// previous GC. +const runtimeMetricHeapLiveBytes = "/gc/heap/live:bytes" + var runtimeMetrics = []string{ runtimeMetricGCAssist, + runtimeMetricGCTotal, runtimeMetricGoTotal, runtimeMetricHeapAlloc, runtimeMetricGoLimit, @@ -629,6 +658,8 @@ var runtimeMetrics = []string{ runtimeMetricMemStackOSBytes, runtimeMetricCumulativeAlloc, runtimeMetricGCCount, + runtimeMetricHeapObjects, + runtimeMetricHeapLiveBytes, runtimeMetricGCPauseTotal, runtimeMetricNonGCPauseTotal, runtimeMetricGCStopTotal, @@ -804,6 +835,8 @@ type RuntimeStatSampler struct { GoHeapReservedBytes *metric.Gauge GoHeapReleasedBytes *metric.Gauge GoTotalAllocBytes *metric.Counter + GoHeapObjects *metric.Gauge + GoHeapLiveBytes *metric.Gauge CgoAllocBytes *metric.Gauge CgoTotalBytes *metric.Gauge GcCount *metric.Counter @@ -813,6 +846,7 @@ type RuntimeStatSampler struct { NonGcStopNS *metric.Gauge GcPausePercent *metric.GaugeFloat64 GcAssistNS *metric.Counter + GcTotalNS *metric.Counter // CPU stats for the CRDB process usage. CPUUserNS *metric.Counter CPUUserPercent *metric.GaugeFloat64 @@ -906,6 +940,8 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta GoHeapReservedBytes: metric.NewGauge(metaGoHeapReservedBytes), GoHeapReleasedBytes: metric.NewGauge(metaGoHeapReleasedBytes), GoTotalAllocBytes: metric.NewCounter(metaGoTotalAllocBytes), + GoHeapObjects: metric.NewGauge(metaGoHeapObjects), + GoHeapLiveBytes: metric.NewGauge(metaGoHeapLiveBytes), CgoAllocBytes: metric.NewGauge(metaCgoAllocBytes), CgoTotalBytes: metric.NewGauge(metaCgoTotalBytes), GcCount: metric.NewCounter(metaGCCount), @@ -913,6 +949,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta GcStopNS: metric.NewGauge(metaGCStopNS), GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent), GcAssistNS: metric.NewCounter(metaGCAssistNS), + GcTotalNS: metric.NewCounter(metaGCTotalNS), NonGcPauseNS: metric.NewGauge(metaNonGCPauseNS), NonGcStopNS: metric.NewGauge(metaNonGCStopNS), @@ -1206,6 +1243,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context, cs *CGoMem rsr.GoHeapReservedBytes.Update(int64(heapReservedBytes)) rsr.GoHeapReleasedBytes.Update(int64(heapReleasedBytes)) rsr.GoTotalAllocBytes.Update(int64(rsr.goRuntimeSampler.uint64(runtimeMetricCumulativeAlloc))) + rsr.GoHeapObjects.Update(int64(rsr.goRuntimeSampler.uint64(runtimeMetricHeapObjects))) + rsr.GoHeapLiveBytes.Update(int64(rsr.goRuntimeSampler.uint64(runtimeMetricHeapLiveBytes))) rsr.CgoCalls.Update(numCgoCall) rsr.Goroutines.Update(int64(numGoroutine)) rsr.RunnableGoroutinesPerCPU.Update(runnableAvg) @@ -1216,6 +1255,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context, cs *CGoMem rsr.GcStopNS.Update(gcStopTotalNs) rsr.GcPausePercent.Update(gcPauseRatio) rsr.GcAssistNS.Update(gcAssistNS) + rsr.GcTotalNS.Update(int64(rsr.goRuntimeSampler.float64(runtimeMetricGCTotal) * 1e9)) rsr.NonGcPauseNS.Update(nonGcPauseTotalNs) rsr.NonGcStopNS.Update(nonGcStopTotalNs)