Skip to content

Commit a00bf59

Browse files
authored
Merge pull request kubernetes#3129 from kgolab/vpa-updater-metrics
Add bunch of metrics to monitor VPA Updater impact in a cluster
2 parents e3219e6 + ff25eb5 commit a00bf59

File tree

5 files changed

+137
-10
lines changed

5 files changed

+137
-10
lines changed

vertical-pod-autoscaler/pkg/updater/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,3 @@ before pod with 20% memory increase and no change in cpu).
2929

3030
# Missing parts
3131
* Recommendation API for fetching data from Vertical Pod Autoscaler Recommender.
32-
* Monitoring.

vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ import (
2424
apiv1 "k8s.io/api/core/v1"
2525
policyv1 "k8s.io/api/policy/v1beta1"
2626
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27-
metrics_updater "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics/updater"
2827
appsinformer "k8s.io/client-go/informers/apps/v1"
2928
coreinformer "k8s.io/client-go/informers/core/v1"
3029
kube_client "k8s.io/client-go/kubernetes"
@@ -141,7 +140,6 @@ func (e *podsEvictionRestrictionImpl) Evict(podToEvict *apiv1.Pod, eventRecorder
141140
}
142141
eventRecorder.Event(podToEvict, apiv1.EventTypeNormal, "EvictedByVPA",
143142
"Pod was evicted by VPA Updater to apply resource recommendation.")
144-
metrics_updater.AddEvictedPod()
145143

146144
if podToEvict.Status.Phase != apiv1.PodPending {
147145
singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]

vertical-pod-autoscaler/pkg/updater/logic/updater.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,11 +177,31 @@ func (u *updater) RunOnce(ctx context.Context) {
177177
}
178178
timer.ObserveStep("AdmissionInit")
179179

180+
// wrappers for metrics which are computed every loop run
181+
controlledPodsCounter := metrics_updater.NewControlledPodsCounter()
182+
evictablePodsCounter := metrics_updater.NewEvictablePodsCounter()
183+
vpasWithEvictablePodsCounter := metrics_updater.NewVpasWithEvictablePodsCounter()
184+
vpasWithEvictedPodsCounter := metrics_updater.NewVpasWithEvictedPodsCounter()
185+
186+
// using defer to protect against 'return' after evictionRateLimiter.Wait
187+
defer controlledPodsCounter.Observe()
188+
defer evictablePodsCounter.Observe()
189+
defer vpasWithEvictablePodsCounter.Observe()
190+
defer vpasWithEvictedPodsCounter.Observe()
191+
192+
// NOTE: this loop assumes that controlledPods are filtered
193+
// to contain only Pods controlled by a VPA in auto or recreate mode
180194
for vpa, livePods := range controlledPods {
195+
vpaSize := len(livePods)
196+
controlledPodsCounter.Add(vpaSize, vpaSize)
181197
evictionLimiter := u.evictionFactory.NewPodsEvictionRestriction(livePods)
182198
podsForUpdate := u.getPodsUpdateOrder(filterNonEvictablePods(livePods, evictionLimiter), vpa)
199+
evictablePodsCounter.Add(vpaSize, len(podsForUpdate))
183200

201+
withEvictable := false
202+
withEvicted := false
184203
for _, pod := range podsForUpdate {
204+
withEvictable = true
185205
if !evictionLimiter.CanEvict(pod) {
186206
continue
187207
}
@@ -194,8 +214,18 @@ func (u *updater) RunOnce(ctx context.Context) {
194214
evictErr := evictionLimiter.Evict(pod, u.eventRecorder)
195215
if evictErr != nil {
196216
klog.Warningf("evicting pod %v failed: %v", pod.Name, evictErr)
217+
} else {
218+
withEvicted = true
219+
metrics_updater.AddEvictedPod(vpaSize)
197220
}
198221
}
222+
223+
if withEvictable {
224+
vpasWithEvictablePodsCounter.Add(vpaSize, 1)
225+
}
226+
if withEvicted {
227+
vpasWithEvictedPodsCounter.Add(vpaSize, 1)
228+
}
199229
}
200230
timer.ObserveStep("EvictPods")
201231
}

vertical-pod-autoscaler/pkg/utils/metrics/metrics.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
package metrics
1919

2020
import (
21+
"math"
2122
"net/http"
2223
"time"
2324

@@ -39,6 +40,10 @@ type ExecutionTimer struct {
3940
const (
4041
// TopMetricsNamespace is a prefix for all VPA-related metrics namespaces
4142
TopMetricsNamespace = "vpa_"
43+
44+
// MaxVpaSizeLog is the base-2 logarithm of the largest VPA size the metrics distinguish: sizes up to 2^MaxVpaSizeLog (~1M).
45+
// Anything above that size will be reported in the top bucket.
46+
MaxVpaSizeLog = 20
4247
)
4348

4449
// Initialize sets up Prometheus to expose metrics & (optionally) health-check on the given address
@@ -87,3 +92,17 @@ func CreateExecutionTimeMetric(namespace string, help string) *prometheus.Histog
8792
}, []string{"step"},
8893
)
8994
}
95+
96+
// GetVpaSizeLog2 returns a bucket number for a metric labelled with number of Pods under a given VPA.
97+
// It is basically log2(vpaSize), capped to MaxVpaSizeLog
98+
func GetVpaSizeLog2(vpaSize int) int {
99+
if vpaSize == 0 {
100+
return 0
101+
}
102+
103+
ret := int(math.Log2(float64(vpaSize)))
104+
if ret > MaxVpaSizeLog {
105+
return MaxVpaSizeLog
106+
}
107+
return ret
108+
}

vertical-pod-autoscaler/pkg/utils/metrics/updater/updater.go

Lines changed: 88 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ limitations under the License.
1818
package updater
1919

2020
import (
21+
"strconv"
22+
2123
"github.com/prometheus/client_golang/prometheus"
2224
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics"
2325
)
@@ -26,13 +28,51 @@ const (
2628
metricsNamespace = metrics.TopMetricsNamespace + "updater"
2729
)
2830

31+
// SizeBasedGauge is a wrapper for incrementally recording values indexed by log2(VPA size).
// Values are accumulated in memory through Add and written to the wrapped GaugeVec by Observe,
// one gauge sample per slot, labelled with the slot index.
// NOTE(review): values has metrics.MaxVpaSizeLog slots (indices 0..MaxVpaSizeLog-1), while
// metrics.GetVpaSizeLog2 can return MaxVpaSizeLog itself; confirm the top bucket cannot
// index past the end of the array.
32+
type SizeBasedGauge struct {
33+
values [metrics.MaxVpaSizeLog]int
34+
gauge *prometheus.GaugeVec
35+
}
36+
2937
var (
30-
evictedCount = prometheus.NewCounter(
38+
controlledCount = prometheus.NewGaugeVec(
39+
prometheus.GaugeOpts{
40+
Namespace: metricsNamespace,
41+
Name: "controlled_pods_total",
42+
Help: "Number of Pods controlled by VPA updater.",
43+
}, []string{"vpa_size_log2"},
44+
)
45+
46+
evictableCount = prometheus.NewGaugeVec(
47+
prometheus.GaugeOpts{
48+
Namespace: metricsNamespace,
49+
Name: "evictable_pods_total",
50+
Help: "Number of Pods matching eviction criteria.",
51+
}, []string{"vpa_size_log2"},
52+
)
53+
54+
evictedCount = prometheus.NewCounterVec(
3155
prometheus.CounterOpts{
3256
Namespace: metricsNamespace,
3357
Name: "evicted_pods_total",
3458
Help: "Number of Pods evicted by Updater to apply a new recommendation.",
35-
},
59+
}, []string{"vpa_size_log2"},
60+
)
61+
62+
vpasWithEvictablePodsCount = prometheus.NewGaugeVec(
63+
prometheus.GaugeOpts{
64+
Namespace: metricsNamespace,
65+
Name: "vpas_with_evictable_pods_total",
66+
Help: "Number of VPA objects with at least one Pod matching eviction criteria.",
67+
}, []string{"vpa_size_log2"},
68+
)
69+
70+
vpasWithEvictedPodsCount = prometheus.NewGaugeVec(
71+
prometheus.GaugeOpts{
72+
Namespace: metricsNamespace,
73+
Name: "vpas_with_evicted_pods_total",
74+
Help: "Number of VPA objects with at least one evicted Pod.",
75+
}, []string{"vpa_size_log2"},
3676
)
3777

3878
functionLatency = metrics.CreateExecutionTimeMetric(metricsNamespace,
@@ -41,16 +81,57 @@ var (
4181

4282
// Register initializes all metrics for VPA Updater
4383
func Register() {
44-
prometheus.MustRegister(evictedCount)
45-
prometheus.MustRegister(functionLatency)
84+
prometheus.MustRegister(controlledCount, evictableCount, evictedCount, vpasWithEvictablePodsCount, vpasWithEvictedPodsCount, functionLatency)
4685
}
4786

4887
// NewExecutionTimer provides a timer for Updater's RunOnce execution
4988
func NewExecutionTimer() *metrics.ExecutionTimer {
5089
return metrics.NewExecutionTimer(functionLatency)
5190
}
5291

53-
// AddEvictedPod increases the counter of pods evicted by VPA
54-
func AddEvictedPod() {
55-
evictedCount.Add(1)
92+
// newSizeBasedGauge provides a wrapper for counting items in a loop
93+
func newSizeBasedGauge(gauge *prometheus.GaugeVec) *SizeBasedGauge {
94+
return &SizeBasedGauge{
95+
values: [metrics.MaxVpaSizeLog]int{},
96+
gauge: gauge,
97+
}
98+
}
99+
100+
// NewControlledPodsCounter returns a wrapper for counting Pods controlled by Updater
101+
func NewControlledPodsCounter() *SizeBasedGauge {
102+
return newSizeBasedGauge(controlledCount)
103+
}
104+
105+
// NewEvictablePodsCounter returns a wrapper for counting Pods which are matching eviction criteria
106+
func NewEvictablePodsCounter() *SizeBasedGauge {
107+
return newSizeBasedGauge(evictableCount)
108+
}
109+
110+
// NewVpasWithEvictablePodsCounter returns a wrapper for counting VPA objects with Pods matching eviction criteria
111+
func NewVpasWithEvictablePodsCounter() *SizeBasedGauge {
112+
return newSizeBasedGauge(vpasWithEvictablePodsCount)
113+
}
114+
115+
// NewVpasWithEvictedPodsCounter returns a wrapper for counting VPA objects with evicted Pods
116+
func NewVpasWithEvictedPodsCounter() *SizeBasedGauge {
117+
return newSizeBasedGauge(vpasWithEvictedPodsCount)
118+
}
119+
120+
// AddEvictedPod increases the counter of pods evicted by Updater, by given VPA size
121+
func AddEvictedPod(vpaSize int) {
122+
log2 := metrics.GetVpaSizeLog2(vpaSize)
123+
evictedCount.WithLabelValues(strconv.Itoa(log2)).Inc()
124+
}
125+
126+
// Add increases the counter for the given VPA size
127+
func (g *SizeBasedGauge) Add(vpaSize int, value int) {
128+
log2 := metrics.GetVpaSizeLog2(vpaSize)
129+
g.values[log2] += value
130+
}
131+
132+
// Observe stores the recorded values into metrics object associated with the wrapper
133+
func (g *SizeBasedGauge) Observe() {
134+
for log2, value := range g.values {
135+
g.gauge.WithLabelValues(strconv.Itoa(log2)).Set(float64(value))
136+
}
56137
}

0 commit comments

Comments
 (0)