Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cgroup/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ type CPUStat struct {
LimitCores float64
}

func (cg Cgroup) CpuStat() *CPUStat {
func (cg *Cgroup) CpuStat() *CPUStat {
cpu, cpuacct := cg.subsystems["cpu"], cg.subsystems["cpuacct"]
if cpu == "" || cpuacct == "" {
st, _ := cg.cpuStatV2()
Expand All @@ -26,7 +26,7 @@ func (cg Cgroup) CpuStat() *CPUStat {
return st
}

func (cg Cgroup) cpuStatV1() (*CPUStat, error) {
func (cg *Cgroup) cpuStatV1() (*CPUStat, error) {
if cg.subsystems["cpu"] == "" || cg.subsystems["cpuacct"] == "" {
return nil, nil
}
Expand Down Expand Up @@ -56,7 +56,7 @@ func (cg Cgroup) cpuStatV1() (*CPUStat, error) {
return res, nil
}

func (cg Cgroup) cpuStatV2() (*CPUStat, error) {
func (cg *Cgroup) cpuStatV2() (*CPUStat, error) {
if cg.subsystems[""] == "" {
return nil, nil
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
some avg10=0.00 avg60=0.00 avg300=0.00 total=465907442
full avg10=0.00 avg60=0.00 avg300=0.00 total=463529433
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
some avg10=0.00 avg60=0.00 avg300=0.05 total=17657662684
full avg10=0.00 avg60=0.00 avg300=0.05 total=17636951020
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
some avg10=0.00 avg60=0.00 avg300=0.00 total=6937313991
full avg10=0.00 avg60=0.00 avg300=0.00 total=6934649214
85 changes: 85 additions & 0 deletions cgroup/psi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package cgroup

import (
"os"
"path"
"strconv"
"strings"

"github.com/coroot/coroot-node-agent/common"
"k8s.io/klog/v2"
)

// PSIStats holds the cumulative pressure-stall totals of a cgroup, in seconds,
// for the cpu, memory and io controllers. For every controller the "Some"
// field carries the total taken from the "some" line of the pressure file and
// the "Full" field carries the total taken from the "full" line.
type PSIStats struct {
	// CPU pressure totals.
	CPUSecondsSome, CPUSecondsFull float64

	// Memory pressure totals.
	MemorySecondsSome, MemorySecondsFull float64

	// I/O pressure totals.
	IOSecondsSome, IOSecondsFull float64
}

// PressureTotals is the parsed content of a single "<controller>.pressure"
// file: the "total=" counters of its "some" and "full" lines, converted to
// seconds.
type PressureTotals struct {
	SomeSecondsTotal, FullSecondsTotal float64
}

// PSI collects the pressure totals of the cpu, memory and io controllers for
// this cgroup.
//
// It returns nil when the cgroup has no entry under the "" subsystem key
// (presumably the unified/v2 hierarchy path), or when any of the three
// pressure files cannot be read or parsed. Errors other than "not exist" are
// logged as warnings before nil is returned.
func (cg *Cgroup) PSI() *PSIStats {
	if cg.subsystems[""] == "" {
		return nil
	}

	res := &PSIStats{}
	// Each controller is paired with the fields its totals are written to.
	// The slice order is the order in which the pressure files are read.
	targets := []struct {
		controller string
		some, full *float64
	}{
		{"cpu", &res.CPUSecondsSome, &res.CPUSecondsFull},
		{"memory", &res.MemorySecondsSome, &res.MemorySecondsFull},
		{"io", &res.IOSecondsSome, &res.IOSecondsFull},
	}

	for _, target := range targets {
		totals, err := cg.readPressure(target.controller)
		if err != nil {
			// A missing pressure file is not worth a warning.
			if !common.IsNotExist(err) {
				klog.Warningln(err)
			}
			// All-or-nothing: a single failing controller discards the result.
			return nil
		}
		*target.some = totals.SomeSecondsTotal
		*target.full = totals.FullSecondsTotal
	}
	return res
}

// readPressure reads "<controller>.pressure" from this cgroup's directory
// under cg2Root and extracts the "total=" counter of each line.
//
// Every non-empty line is split on whitespace; the first field is the line
// kind and the first remaining field starting with "total=" is parsed as an
// unsigned integer. Totals of "some" and "full" lines are converted from
// microseconds to seconds; totals of any other kind are parsed but ignored.
// A read failure or an unparsable total is returned as the error.
func (cg *Cgroup) readPressure(controller string) (*PressureTotals, error) {
	const totalKey = "total="

	file := path.Join(cg2Root, cg.subsystems[""], controller+".pressure")
	raw, err := os.ReadFile(file)
	if err != nil {
		return nil, err
	}

	var totals PressureTotals
	rows := strings.Split(strings.TrimSpace(string(raw)), "\n")
	for _, row := range rows {
		fields := strings.Fields(row)
		if len(fields) == 0 {
			continue
		}
		for _, field := range fields[1:] {
			if !strings.HasPrefix(field, totalKey) {
				continue
			}
			usec, err := strconv.ParseUint(field[len(totalKey):], 10, 64)
			if err != nil {
				return nil, err
			}
			seconds := float64(usec) / 1e6 // microseconds to seconds
			switch fields[0] {
			case "some":
				totals.SomeSecondsTotal = seconds
			case "full":
				totals.FullSecondsTotal = seconds
			}
			// Only the first "total=" field of a line is considered.
			break
		}
	}
	return &totals, nil
}
27 changes: 27 additions & 0 deletions cgroup/psi_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package cgroup

import (
"path"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestCgroupPSI checks PSI against the fixture tree: the cgroup of process
// 400 must yield the totals stored in the fixture pressure files (converted
// from microseconds to seconds), and the cgroup of process 100 must yield no
// PSI stats at all.
func TestCgroupPSI(t *testing.T) {
	// Point both cgroup roots at the fixture tree.
	cgRoot = "fixtures/cgroup"
	cg2Root = "fixtures/cgroup"

	// Fail with a clear message if the fixture cannot be loaded, instead of
	// panicking later on a nil cgroup.
	cg, err := NewFromProcessCgroupFile(path.Join("fixtures/proc/400/cgroup"))
	require.NoError(t, err)
	require.NotNil(t, cg)

	stat := cg.PSI()
	require.NotNil(t, stat)
	assert.Equal(t, float64(465907442)/1e6, stat.CPUSecondsSome)
	assert.Equal(t, float64(463529433)/1e6, stat.CPUSecondsFull)
	assert.Equal(t, float64(6937313991)/1e6, stat.MemorySecondsSome)
	assert.Equal(t, float64(6934649214)/1e6, stat.MemorySecondsFull)
	assert.Equal(t, float64(17657662684)/1e6, stat.IOSecondsSome)
	assert.Equal(t, float64(17636951020)/1e6, stat.IOSecondsFull)

	cg, err = NewFromProcessCgroupFile(path.Join("fixtures/proc/100/cgroup"))
	require.NoError(t, err)
	require.NotNil(t, cg)
	assert.Nil(t, cg.PSI())
}
9 changes: 9 additions & 0 deletions containers/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,15 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
}
}

if psi := c.cgroup.PSI(); psi != nil {
ch <- counter(metrics.PsiCPU, psi.CPUSecondsSome, "some")
ch <- counter(metrics.PsiCPU, psi.CPUSecondsFull, "full")
ch <- counter(metrics.PsiMemory, psi.MemorySecondsSome, "some")
ch <- counter(metrics.PsiMemory, psi.MemorySecondsFull, "full")
ch <- counter(metrics.PsiIO, psi.IOSecondsSome, "some")
ch <- counter(metrics.PsiIO, psi.IOSecondsFull, "full")
}

if c.oomKills > 0 {
ch <- counter(metrics.OOMKills, float64(c.oomKills))
}
Expand Down
8 changes: 8 additions & 0 deletions containers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ var metrics = struct {
MemoryCache *prometheus.Desc
OOMKills *prometheus.Desc

PsiCPU *prometheus.Desc
PsiMemory *prometheus.Desc
PsiIO *prometheus.Desc

DiskDelay *prometheus.Desc
DiskSize *prometheus.Desc
DiskUsed *prometheus.Desc
Expand Down Expand Up @@ -71,6 +75,10 @@ var metrics = struct {
MemoryCache: metric("container_resources_memory_cache_bytes", "Amount of page cache memory allocated by the container"),
OOMKills: metric("container_oom_kills_total", "Total number of times the container was terminated by the OOM killer"),

PsiCPU: metric("container_resources_cpu_pressure_waiting_seconds_total", "Total time in seconds tha the container were delayed due to CPU pressure", "kind"),
PsiMemory: metric("container_resources_memory_pressure_waiting_seconds_total", "Total time in seconds that the container were delayed due to memory pressure", "kind"),
PsiIO: metric("container_resources_io_pressure_waiting_seconds_total", "Total time in seconds that the container were delayed due to I/O pressure", "kind"),

DiskDelay: metric("container_resources_disk_delay_seconds_total", "Total time duration processes of the container have been waiting fot I/Os to complete"),
DiskSize: metric("container_resources_disk_size_bytes", "Total capacity of the volume", "mount_point", "device", "volume"),
DiskUsed: metric("container_resources_disk_used_bytes", "Used capacity of the volume", "mount_point", "device", "volume"),
Expand Down
Loading