feat: Add actual pod resources metrics #2702

Open · wants to merge 3 commits into main
2 changes: 2 additions & 0 deletions docs/metrics/workload/pod-metrics.md
@@ -30,7 +30,9 @@
| kube_pod_status_container_ready_time | Gauge | Time when the container of the pod entered Ready state. | seconds | `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
| kube_pod_container_status_restarts_total | Counter | The number of container restarts per container | | `container`=&lt;container-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `pod`=&lt;pod-name&gt; <br> `uid`=&lt;pod-uid&gt; | STABLE | - |
| kube_pod_container_resource_requests | Gauge | The number of requested request resource by a container. It is recommended to use the `kube_pod_resource_requests` metric exposed by kube-scheduler instead, as it is more precise. | `cpu`=&lt;core&gt; <br> `memory`=&lt;bytes&gt; | `resource`=&lt;resource-name&gt; <br> `unit`=&lt;resource-unit&gt; <br> `container`=&lt;container-name&gt; <br> `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `node`=&lt; node-name&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
| kube_pod_container_actual_resource_requests | Gauge | The number of actually requested request resource by a container calculated based on status.containerStatuses of a Pod. | `cpu`=&lt;core&gt; <br> `memory`=&lt;bytes&gt; | `resource`=&lt;resource-name&gt; <br> `unit`=&lt;resource-unit&gt; <br> `container`=&lt;container-name&gt; <br> `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `node`=&lt; node-name&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
| kube_pod_container_resource_limits | Gauge | The number of requested limit resource by a container. It is recommended to use the `kube_pod_resource_limits` metric exposed by kube-scheduler instead, as it is more precise. | `cpu`=&lt;core&gt; <br> `memory`=&lt;bytes&gt; | `resource`=&lt;resource-name&gt; <br> `unit`=&lt;resource-unit&gt; <br> `container`=&lt;container-name&gt; <br> `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `node`=&lt; node-name&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
| kube_pod_container_actual_resource_limits | Gauge | The number of actually requested limit resource by a container calculated based on status.containerStatuses of a Pod. | `cpu`=&lt;core&gt; <br> `memory`=&lt;bytes&gt; | `resource`=&lt;resource-name&gt; <br> `unit`=&lt;resource-unit&gt; <br> `container`=&lt;container-name&gt; <br> `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `node`=&lt; node-name&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
| kube_pod_overhead_cpu_cores | Gauge | The pod overhead in regards to cpu cores associated with running a pod | core | `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
| kube_pod_overhead_memory_bytes | Gauge | The pod overhead in regards to memory associated with running a pod | bytes | `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
| kube_pod_runtimeclass_name_info | Gauge | The runtimeclass associated with the pod | | `pod`=&lt;pod-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `uid`=&lt;pod-uid&gt; | EXPERIMENTAL | - |
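The two new `kube_pod_container_actual_resource_*` metrics are computed from the `resources` field reported in a Pod's `status.containerStatuses`, rather than from the container spec used by the existing `kube_pod_container_resource_*` metrics. Below is a minimal client-go sketch of reading that same field directly, for illustration only (not part of this change; it assumes in-cluster configuration and a cluster whose kubelet populates `status.containerStatuses[].resources`):

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	pods, err := client.CoreV1().Pods("default").List(context.Background(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for _, p := range pods.Items {
		for _, cs := range p.Status.ContainerStatuses {
			// cs.Resources is only set once the kubelet reports the
			// resources actually applied to the running container.
			if cs.Resources == nil {
				continue
			}
			fmt.Printf("%s/%s %s requests=%v limits=%v\n",
				p.Namespace, p.Name, cs.Name, cs.Resources.Requests, cs.Resources.Limits)
		}
	}
}
```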
137 changes: 137 additions & 0 deletions internal/store/pod.go
@@ -45,7 +45,9 @@ func podMetricFamilies(allowAnnotationsList, allowLabelsList []string) []generat
createPodCompletionTimeFamilyGenerator(),
createPodContainerInfoFamilyGenerator(),
createPodContainerResourceLimitsFamilyGenerator(),
createPodContainerActualResourceLimitsFamilyGenerator(),
createPodContainerResourceRequestsFamilyGenerator(),
createPodContainerActualResourceRequestsFamilyGenerator(),
createPodContainerStateStartedFamilyGenerator(),
createPodContainerStatusLastTerminatedReasonFamilyGenerator(),
createPodContainerStatusLastTerminatedExitCodeFamilyGenerator(),
@@ -165,6 +167,74 @@ func createPodContainerInfoFamilyGenerator() generator.FamilyGenerator {
)
}

func createPodContainerActualResourceLimitsFamilyGenerator() generator.FamilyGenerator {
	return *generator.NewFamilyGeneratorWithStability(
		"kube_pod_container_actual_resource_limits",
		"The number of actually requested limit resource by a container calculated based on status.containerStatuses of a Pod.",
		metric.Gauge,
		basemetrics.ALPHA,
		"",
		wrapPodFunc(func(p *v1.Pod) *metric.Family {
			ms := []*metric.Metric{}

			for _, c := range p.Status.ContainerStatuses {
				if c.Resources == nil {
					continue
				}

				lim := c.Resources.Limits

				for resourceName, val := range lim {
					switch resourceName {
					case v1.ResourceCPU:
						ms = append(ms, &metric.Metric{
							LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitCore)},
							Value:       convertValueToFloat64(&val),
						})
					case v1.ResourceStorage:
						fallthrough
					case v1.ResourceEphemeralStorage:
						fallthrough
					case v1.ResourceMemory:
						ms = append(ms, &metric.Metric{
							LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
							Value:       float64(val.Value()),
						})
					default:
						if isHugePageResourceName(resourceName) {
							ms = append(ms, &metric.Metric{
								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
								Value:       float64(val.Value()),
							})
						}
						if isAttachableVolumeResourceName(resourceName) {
							ms = append(ms, &metric.Metric{
								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
								Value:       float64(val.Value()),
							})
						}
						if isExtendedResourceName(resourceName) {
							ms = append(ms, &metric.Metric{
								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitInteger)},
								Value:       float64(val.Value()),
							})
						}
					}
				}
			}

			for _, metric := range ms {
				metric.LabelKeys = []string{"container", "node", "resource", "unit"}
			}

			return &metric.Family{
				Metrics: ms,
			}
		}),
	)
}

func createPodContainerResourceLimitsFamilyGenerator() generator.FamilyGenerator {
return *generator.NewFamilyGeneratorWithStability(
"kube_pod_container_resource_limits",
@@ -229,6 +299,73 @@ func createPodContainerResourceLimitsFamilyGenerator() generator.FamilyGenerator
)
}

func createPodContainerActualResourceRequestsFamilyGenerator() generator.FamilyGenerator {
	return *generator.NewFamilyGeneratorWithStability(
		"kube_pod_container_actual_resource_requests",
		"The number of actually requested request resource by a container calculated based on status.containerStatuses of a Pod.",
		metric.Gauge,
		basemetrics.ALPHA,
		"",
		wrapPodFunc(func(p *v1.Pod) *metric.Family {
			ms := []*metric.Metric{}

			for _, c := range p.Status.ContainerStatuses {
				if c.Resources == nil {
					continue
				}

				req := c.Resources.Requests

				for resourceName, val := range req {
					switch resourceName {
					case v1.ResourceCPU:
						ms = append(ms, &metric.Metric{
							LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitCore)},
							Value:       convertValueToFloat64(&val),
						})
					case v1.ResourceStorage:
						fallthrough
					case v1.ResourceEphemeralStorage:
						fallthrough
					case v1.ResourceMemory:
						ms = append(ms, &metric.Metric{
							LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
							Value:       float64(val.Value()),
						})
					default:
						if isHugePageResourceName(resourceName) {
							ms = append(ms, &metric.Metric{
								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
								Value:       float64(val.Value()),
							})
						}
						if isAttachableVolumeResourceName(resourceName) {
							ms = append(ms, &metric.Metric{
								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
								Value:       float64(val.Value()),
							})
						}
						if isExtendedResourceName(resourceName) {
							ms = append(ms, &metric.Metric{
								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitInteger)},
								Value:       float64(val.Value()),
							})
						}
					}
				}
			}

			for _, metric := range ms {
				metric.LabelKeys = []string{"container", "node", "resource", "unit"}
			}

			return &metric.Family{
				Metrics: ms,
			}
		}),
	)
}

func createPodContainerResourceRequestsFamilyGenerator() generator.FamilyGenerator {
return *generator.NewFamilyGeneratorWithStability(
"kube_pod_container_resource_requests",
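On the unit handling in the two generators above: CPU quantities are exported in cores and memory-like quantities in bytes, which is how `300m` becomes `0.3` and `200M` becomes `2e+08` in the test expectations further down. A small standalone sketch of those conversions, assuming behaviour equivalent to `resource.Quantity`'s standard accessors (the `convertValueToFloat64` helper is pre-existing kube-state-metrics code not shown in this diff, so `AsApproximateFloat64` is used here as a stand-in):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	cpu := resource.MustParse("300m") // 300 millicores
	mem := resource.MustParse("200M") // 200 * 10^6 bytes
	gpu := resource.MustParse("2")    // extended resource count

	// CPU is reported in cores, so 300m becomes 0.3.
	fmt.Println(cpu.AsApproximateFloat64()) // 0.3

	// Memory-like resources are reported in bytes via Value().
	fmt.Println(float64(mem.Value())) // 2e+08

	// Extended resources keep their integer count.
	fmt.Println(float64(gpu.Value())) // 2
}
```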
2 changes: 1 addition & 1 deletion internal/store/pod_test.go
@@ -2282,7 +2282,7 @@ func BenchmarkPodStore(b *testing.B) {
},
}

expectedFamilies := 55
expectedFamilies := 57
for n := 0; n < b.N; n++ {
families := f(pod)
if len(families) != expectedFamilies {
30 changes: 30 additions & 0 deletions pkg/app/server_test.go
@@ -209,6 +209,8 @@ func TestFullScrapeCycle(t *testing.T) {

expected := `# HELP kube_pod_annotations Kubernetes annotations converted to Prometheus labels.
# HELP kube_pod_completion_time [STABLE] Completion time in unix timestamp for a pod.
# HELP kube_pod_container_actual_resource_limits The number of actually requested limit resource by a container calculated based on status.containerStatuses of a Pod.
# HELP kube_pod_container_actual_resource_requests The number of actually requested request resource by a container calculated based on status.containerStatuses of a Pod.
# HELP kube_pod_container_info [STABLE] Information about a container in a pod.
# HELP kube_pod_container_resource_limits The number of requested limit resource by a container. It is recommended to use the kube_pod_resource_limits metric exposed by kube-scheduler instead, as it is more precise.
# HELP kube_pod_container_resource_requests The number of requested request resource by a container. It is recommended to use the kube_pod_resource_requests metric exposed by kube-scheduler instead, as it is more precise.
@@ -263,6 +265,8 @@ func TestFullScrapeCycle(t *testing.T) {
# HELP kube_pod_tolerations Information about the pod tolerations
# TYPE kube_pod_annotations gauge
# TYPE kube_pod_completion_time gauge
# TYPE kube_pod_container_actual_resource_limits gauge
# TYPE kube_pod_container_actual_resource_requests gauge
# TYPE kube_pod_container_info gauge
# TYPE kube_pod_container_resource_limits gauge
# TYPE kube_pod_container_resource_requests gauge
@@ -315,6 +319,16 @@ func TestFullScrapeCycle(t *testing.T) {
# TYPE kube_pod_status_unschedulable gauge
# TYPE kube_pod_status_unscheduled_time gauge
# TYPE kube_pod_tolerations gauge
kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="cpu",unit="core"} 0.3
kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="ephemeral_storage",unit="byte"} 4e+08
kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="memory",unit="byte"} 2e+08
kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="nvidia_com_gpu",unit="integer"} 2
kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="storage",unit="byte"} 5e+08
kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="cpu",unit="core"} 0.3
kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="ephemeral_storage",unit="byte"} 4e+08
kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="memory",unit="byte"} 2e+08
kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="nvidia_com_gpu",unit="integer"} 2
kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="storage",unit="byte"} 5e+08
kube_pod_container_info{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",image_spec="k8s.gcr.io/hyperkube2_spec",image="k8s.gcr.io/hyperkube2",image_id="docker://sha256:bbb",container_id="docker://cd456"} 1
kube_pod_container_info{namespace="default",pod="pod0",uid="abc-0",container="pod1_con2",image_spec="k8s.gcr.io/hyperkube3_spec",image="k8s.gcr.io/hyperkube3",image_id="docker://sha256:ccc",container_id="docker://ef789"} 1
kube_pod_container_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="cpu",unit="core"} 0.2
@@ -852,6 +866,22 @@ func pod(client *fake.Clientset, index int) error {
ExitCode: 137,
},
},
Resources: &v1.ResourceRequirements{
	Limits: map[v1.ResourceName]resource.Quantity{
		v1.ResourceCPU:                    resource.MustParse("300m"),
		v1.ResourceMemory:                 resource.MustParse("200M"),
		v1.ResourceEphemeralStorage:       resource.MustParse("400M"),
		v1.ResourceStorage:                resource.MustParse("500M"),
		v1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
	},
	Requests: map[v1.ResourceName]resource.Quantity{
		v1.ResourceCPU:                    resource.MustParse("300m"),
		v1.ResourceMemory:                 resource.MustParse("200M"),
		v1.ResourceEphemeralStorage:       resource.MustParse("400M"),
		v1.ResourceStorage:                resource.MustParse("500M"),
		v1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
	},
},
},
{
Name: "pod1_con2",
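Regarding the label values in the expected scrape output above: resource names such as `ephemeral-storage` and `nvidia.com/gpu` show up as `ephemeral_storage` and `nvidia_com_gpu` because `SanitizeLabelName` replaces characters that are not valid in the label value. A rough standalone approximation of that behaviour, for illustration only (the real helper is existing kube-state-metrics code and may differ in detail):

```go
package main

import (
	"fmt"
	"regexp"
)

// invalidLabelChars matches any character that is not kept in the sanitized
// resource label, mirroring the "replace with underscore" behaviour visible
// in the expected scrape output above.
var invalidLabelChars = regexp.MustCompile(`[^a-zA-Z0-9_]`)

func sanitizeLabelName(s string) string {
	return invalidLabelChars.ReplaceAllString(s, "_")
}

func main() {
	fmt.Println(sanitizeLabelName("ephemeral-storage")) // ephemeral_storage
	fmt.Println(sanitizeLabelName("nvidia.com/gpu"))    // nvidia_com_gpu
}
```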