Skip to content

Commit 0db7074

Browse files
committed
Add new endpoint for resource metrics.
1 parent 5ead497 commit 0db7074

File tree

4 files changed

+358
-8
lines changed

4 files changed

+358
-8
lines changed
Lines changed: 153 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,153 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package collectors
18+
19+
import (
20+
"time"
21+
22+
"k8s.io/component-base/metrics"
23+
"k8s.io/klog"
24+
summary "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
25+
"k8s.io/kubernetes/pkg/kubelet/server/stats"
26+
)
27+
28+
var (
29+
nodeCPUUsageDesc = metrics.NewDesc("node_cpu_usage_seconds",
30+
"Cumulative cpu time consumed by the node in core-seconds",
31+
nil,
32+
nil,
33+
metrics.ALPHA,
34+
"")
35+
36+
nodeMemoryUsageDesc = metrics.NewDesc("node_memory_working_set_bytes",
37+
"Current working set of the node in bytes",
38+
nil,
39+
nil,
40+
metrics.ALPHA,
41+
"")
42+
43+
containerCPUUsageDesc = metrics.NewDesc("container_cpu_usage_seconds",
44+
"Cumulative cpu time consumed by the container in core-seconds",
45+
[]string{"container", "pod", "namespace"},
46+
nil,
47+
metrics.ALPHA,
48+
"")
49+
50+
containerMemoryUsageDesc = metrics.NewDesc("container_memory_working_set_bytes",
51+
"Current working set of the container in bytes",
52+
[]string{"container", "pod", "namespace"},
53+
nil,
54+
metrics.ALPHA,
55+
"")
56+
57+
resouceScrapeResultDesc = metrics.NewDesc("scrape_error",
58+
"1 if there was an error while getting container metrics, 0 otherwise",
59+
nil,
60+
nil,
61+
metrics.ALPHA,
62+
"")
63+
)
64+
65+
// NewResourceMetricsCollector returns a metrics.StableCollector which exports resource metrics
66+
func NewResourceMetricsCollector(provider stats.SummaryProvider) metrics.StableCollector {
67+
return &resourceMetricsCollector{
68+
provider: provider,
69+
}
70+
}
71+
72+
type resourceMetricsCollector struct {
73+
metrics.BaseStableCollector
74+
75+
provider stats.SummaryProvider
76+
}
77+
78+
// Check if resourceMetricsCollector implements necessary interface
79+
var _ metrics.StableCollector = &resourceMetricsCollector{}
80+
81+
// DescribeWithStability implements metrics.StableCollector
82+
func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
83+
ch <- nodeCPUUsageDesc
84+
ch <- nodeMemoryUsageDesc
85+
ch <- containerCPUUsageDesc
86+
ch <- containerMemoryUsageDesc
87+
ch <- resouceScrapeResultDesc
88+
}
89+
90+
// CollectWithStability implements metrics.StableCollector
91+
// Since new containers are frequently created and removed, using the Gauge would
92+
// leak metric collectors for containers or pods that no longer exist. Instead, implement
93+
// custom collector in a way that only collects metrics for active containers.
94+
func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metric) {
95+
var errorCount float64
96+
defer func() {
97+
ch <- metrics.NewLazyConstMetric(resouceScrapeResultDesc, metrics.GaugeValue, errorCount)
98+
}()
99+
statsSummary, err := rc.provider.GetCPUAndMemoryStats()
100+
if err != nil {
101+
errorCount = 1
102+
klog.Warningf("Error getting summary for resourceMetric prometheus endpoint: %v", err)
103+
return
104+
}
105+
106+
rc.collectNodeCPUMetrics(ch, statsSummary.Node)
107+
rc.collectNodeMemoryMetrics(ch, statsSummary.Node)
108+
109+
for _, pod := range statsSummary.Pods {
110+
for _, container := range pod.Containers {
111+
rc.collectContainerCPUMetrics(ch, pod, container)
112+
rc.collectContainerMemoryMetrics(ch, pod, container)
113+
}
114+
}
115+
}
116+
117+
func (rc *resourceMetricsCollector) collectNodeCPUMetrics(ch chan<- metrics.Metric, s summary.NodeStats) {
118+
if s.CPU == nil {
119+
return
120+
}
121+
122+
ch <- metrics.NewLazyMetricWithTimestamp(s.CPU.Time.Time,
123+
metrics.NewLazyConstMetric(nodeCPUUsageDesc, metrics.GaugeValue, float64(*s.CPU.UsageCoreNanoSeconds)/float64(time.Second)))
124+
}
125+
126+
func (rc *resourceMetricsCollector) collectNodeMemoryMetrics(ch chan<- metrics.Metric, s summary.NodeStats) {
127+
if s.Memory == nil {
128+
return
129+
}
130+
131+
ch <- metrics.NewLazyMetricWithTimestamp(s.Memory.Time.Time,
132+
metrics.NewLazyConstMetric(nodeMemoryUsageDesc, metrics.GaugeValue, float64(*s.Memory.WorkingSetBytes)))
133+
}
134+
135+
func (rc *resourceMetricsCollector) collectContainerCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) {
136+
if s.CPU == nil {
137+
return
138+
}
139+
140+
ch <- metrics.NewLazyMetricWithTimestamp(s.CPU.Time.Time,
141+
metrics.NewLazyConstMetric(containerCPUUsageDesc, metrics.GaugeValue,
142+
float64(*s.CPU.UsageCoreNanoSeconds)/float64(time.Second), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
143+
}
144+
145+
func (rc *resourceMetricsCollector) collectContainerMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) {
146+
if s.Memory == nil {
147+
return
148+
}
149+
150+
ch <- metrics.NewLazyMetricWithTimestamp(s.Memory.Time.Time,
151+
metrics.NewLazyConstMetric(containerMemoryUsageDesc, metrics.GaugeValue,
152+
float64(*s.Memory.WorkingSetBytes), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
153+
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package collectors
18+
19+
import (
20+
"fmt"
21+
"strings"
22+
"testing"
23+
"time"
24+
25+
"github.com/stretchr/testify/mock"
26+
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
"k8s.io/component-base/metrics/testutil"
29+
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
30+
)
31+
32+
type mockSummaryProvider struct {
33+
mock.Mock
34+
}
35+
36+
func (m *mockSummaryProvider) Get(updateStats bool) (*statsapi.Summary, error) {
37+
args := m.Called(updateStats)
38+
return args.Get(0).(*statsapi.Summary), args.Error(1)
39+
}
40+
41+
func (m *mockSummaryProvider) GetCPUAndMemoryStats() (*statsapi.Summary, error) {
42+
args := m.Called()
43+
return args.Get(0).(*statsapi.Summary), args.Error(1)
44+
}
45+
46+
func TestCollectResourceMetrics(t *testing.T) {
47+
testTime := metav1.NewTime(time.Unix(2, 0)) // a static timestamp: 2000
48+
interestedMetrics := []string{
49+
"scrape_error",
50+
"node_cpu_usage_seconds",
51+
"node_memory_working_set_bytes",
52+
"container_cpu_usage_seconds",
53+
"container_memory_working_set_bytes",
54+
}
55+
56+
tests := []struct {
57+
name string
58+
summary *statsapi.Summary
59+
summaryErr error
60+
expectedMetrics string
61+
}{
62+
{
63+
name: "error getting summary",
64+
summary: nil,
65+
summaryErr: fmt.Errorf("failed to get summary"),
66+
expectedMetrics: `
67+
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
68+
# TYPE scrape_error gauge
69+
scrape_error 1
70+
`,
71+
},
72+
{
73+
name: "arbitrary node metrics",
74+
summary: &statsapi.Summary{
75+
Node: statsapi.NodeStats{
76+
CPU: &statsapi.CPUStats{
77+
Time: testTime,
78+
UsageCoreNanoSeconds: uint64Ptr(10000000000),
79+
},
80+
Memory: &statsapi.MemoryStats{
81+
Time: testTime,
82+
WorkingSetBytes: uint64Ptr(1000),
83+
},
84+
},
85+
},
86+
summaryErr: nil,
87+
expectedMetrics: `
88+
# HELP node_cpu_usage_seconds [ALPHA] Cumulative cpu time consumed by the node in core-seconds
89+
# TYPE node_cpu_usage_seconds gauge
90+
node_cpu_usage_seconds 10 2000
91+
# HELP node_memory_working_set_bytes [ALPHA] Current working set of the node in bytes
92+
# TYPE node_memory_working_set_bytes gauge
93+
node_memory_working_set_bytes 1000 2000
94+
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
95+
# TYPE scrape_error gauge
96+
scrape_error 0
97+
`,
98+
},
99+
{
100+
name: "arbitrary container metrics for different container, pods and namespaces",
101+
summary: &statsapi.Summary{
102+
Pods: []statsapi.PodStats{
103+
{
104+
PodRef: statsapi.PodReference{
105+
Name: "pod_a",
106+
Namespace: "namespace_a",
107+
},
108+
Containers: []statsapi.ContainerStats{
109+
{
110+
Name: "container_a",
111+
CPU: &statsapi.CPUStats{
112+
Time: testTime,
113+
UsageCoreNanoSeconds: uint64Ptr(10000000000),
114+
},
115+
Memory: &statsapi.MemoryStats{
116+
Time: testTime,
117+
WorkingSetBytes: uint64Ptr(1000),
118+
},
119+
},
120+
{
121+
Name: "container_b",
122+
CPU: &statsapi.CPUStats{
123+
Time: testTime,
124+
UsageCoreNanoSeconds: uint64Ptr(10000000000),
125+
},
126+
Memory: &statsapi.MemoryStats{
127+
Time: testTime,
128+
WorkingSetBytes: uint64Ptr(1000),
129+
},
130+
},
131+
},
132+
},
133+
{
134+
PodRef: statsapi.PodReference{
135+
Name: "pod_b",
136+
Namespace: "namespace_b",
137+
},
138+
Containers: []statsapi.ContainerStats{
139+
{
140+
Name: "container_a",
141+
CPU: &statsapi.CPUStats{
142+
Time: testTime,
143+
UsageCoreNanoSeconds: uint64Ptr(10000000000),
144+
},
145+
Memory: &statsapi.MemoryStats{
146+
Time: testTime,
147+
WorkingSetBytes: uint64Ptr(1000),
148+
},
149+
},
150+
},
151+
},
152+
},
153+
},
154+
summaryErr: nil,
155+
expectedMetrics: `
156+
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
157+
# TYPE scrape_error gauge
158+
scrape_error 0
159+
# HELP container_cpu_usage_seconds [ALPHA] Cumulative cpu time consumed by the container in core-seconds
160+
# TYPE container_cpu_usage_seconds gauge
161+
container_cpu_usage_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 10 2000
162+
container_cpu_usage_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 10 2000
163+
container_cpu_usage_seconds{container="container_b",namespace="namespace_a",pod="pod_a"} 10 2000
164+
# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
165+
# TYPE container_memory_working_set_bytes gauge
166+
container_memory_working_set_bytes{container="container_a",namespace="namespace_a",pod="pod_a"} 1000 2000
167+
container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 2000
168+
container_memory_working_set_bytes{container="container_b",namespace="namespace_a",pod="pod_a"} 1000 2000
169+
`,
170+
},
171+
}
172+
173+
for _, test := range tests {
174+
tc := test
175+
t.Run(tc.name, func(t *testing.T) {
176+
provider := &mockSummaryProvider{}
177+
provider.On("GetCPUAndMemoryStats").Return(tc.summary, tc.summaryErr)
178+
collector := NewResourceMetricsCollector(provider)
179+
180+
if err := testutil.CustomCollectAndCompare(collector, strings.NewReader(tc.expectedMetrics), interestedMetrics...); err != nil {
181+
t.Fatal(err)
182+
}
183+
})
184+
}
185+
}
186+
187+
// uint64Ptr returns a pointer to a freshly allocated copy of u, for filling
// pointer-typed stats fields in test fixtures.
func uint64Ptr(u uint64) *uint64 {
	p := new(uint64)
	*p = u
	return p
}

pkg/kubelet/server/auth_test.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -128,6 +128,7 @@ func AuthzTestCases() []AuthzTestCase {
128128
"/metrics/cadvisor": "metrics",
129129
"/metrics/probes": "metrics",
130130
"/metrics/resource/v1alpha1": "metrics",
131+
"/metrics/resource": "metrics",
131132
"/pods/": "proxy",
132133
"/portForward/{podNamespace}/{podID}": "proxy",
133134
"/portForward/{podNamespace}/{podID}/{uid}": "proxy",

pkg/kubelet/server/server.go

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ import (
3838
"github.com/google/cadvisor/metrics"
3939
"google.golang.org/grpc"
4040
"k8s.io/klog"
41+
"k8s.io/kubernetes/pkg/kubelet/metrics/collectors"
4142

4243
"k8s.io/api/core/v1"
4344
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -74,13 +75,13 @@ import (
7475
)
7576

7677
const (
77-
metricsPath = "/metrics"
78-
cadvisorMetricsPath = "/metrics/cadvisor"
79-
resourceMetricsPathPrefix = "/metrics/resource"
80-
proberMetricsPath = "/metrics/probes"
81-
specPath = "/spec/"
82-
statsPath = "/stats/"
83-
logsPath = "/logs/"
78+
metricsPath = "/metrics"
79+
cadvisorMetricsPath = "/metrics/cadvisor"
80+
resourceMetricsPath = "/metrics/resource"
81+
proberMetricsPath = "/metrics/probes"
82+
specPath = "/spec/"
83+
statsPath = "/stats/"
84+
logsPath = "/logs/"
8485
)
8586

8687
// Server is a http.Handler which exposes kubelet functionality over HTTP.
@@ -321,10 +322,16 @@ func (s *Server) InstallDefaultHandlers(enableCAdvisorJSONEndpoints bool) {
321322

322323
v1alpha1ResourceRegistry := compbasemetrics.NewKubeRegistry()
323324
v1alpha1ResourceRegistry.CustomMustRegister(stats.NewPrometheusResourceMetricCollector(s.resourceAnalyzer, v1alpha1.Config()))
324-
s.restfulCont.Handle(path.Join(resourceMetricsPathPrefix, v1alpha1.Version),
325+
s.restfulCont.Handle(path.Join(resourceMetricsPath, v1alpha1.Version),
325326
compbasemetrics.HandlerFor(v1alpha1ResourceRegistry, compbasemetrics.HandlerOpts{ErrorHandling: compbasemetrics.ContinueOnError}),
326327
)
327328

329+
resourceRegistry := compbasemetrics.NewKubeRegistry()
330+
resourceRegistry.CustomMustRegister(collectors.NewResourceMetricsCollector(s.resourceAnalyzer))
331+
s.restfulCont.Handle(resourceMetricsPath,
332+
compbasemetrics.HandlerFor(resourceRegistry, compbasemetrics.HandlerOpts{ErrorHandling: compbasemetrics.ContinueOnError}),
333+
)
334+
328335
// prober metrics are exposed under a different endpoint
329336

330337
p := compbasemetrics.NewKubeRegistry()

0 commit comments

Comments
 (0)