Add a metric for observing recommendation stability & changes.

kgolab · kgolab · commit ed491861de64 · 2020-06-01T08:37:24.000+02:00
diff --git a/vertical-pod-autoscaler/pkg/recommender/model/cluster.go b/vertical-pod-autoscaler/pkg/recommender/model/cluster.go
@@ -46,8 +46,6 @@ type ClusterState struct {
 	// time we've noticed the recommendation missing or last time we logged
 	// a warning about it.
 	EmptyVPAs map[VpaID]time.Time
-	// VpaPodCount contains number of live Pods matching a given VPA object.
-	VpaPodCount map[VpaID]int
 	// Observed VPAs. Used to check if there are updates needed.
 	ObservedVpas []*vpa_types.VerticalPodAutoscaler
 
@@ -99,7 +97,6 @@ func NewClusterState() *ClusterState {
 		Pods:              make(map[PodID]*PodState),
 		Vpas:              make(map[VpaID]*Vpa),
 		EmptyVPAs:         make(map[VpaID]time.Time),
-		VpaPodCount:       make(map[VpaID]int),
 		aggregateStateMap: make(aggregateContainerStatesMap),
 		labelSetMap:       make(labelSetMap),
 	}
@@ -147,7 +144,7 @@ func (cluster *ClusterState) AddOrUpdatePod(podID PodID, newLabels labels.Set, p
 func (cluster *ClusterState) addPodToItsVpa(pod *PodState) {
 	for _, vpa := range cluster.Vpas {
 		if vpa_utils.PodLabelsMatchVPA(pod.ID.Namespace, cluster.labelSetMap[pod.labelSetKey], vpa.ID.Namespace, vpa.PodSelector) {
-			cluster.VpaPodCount[vpa.ID]++
+			vpa.PodCount++
 		}
 	}
 }
@@ -156,7 +153,7 @@ func (cluster *ClusterState) addPodToItsVpa(pod *PodState) {
 func (cluster *ClusterState) removePodFromItsVpa(pod *PodState) {
 	for _, vpa := range cluster.Vpas {
 		if vpa_utils.PodLabelsMatchVPA(pod.ID.Namespace, cluster.labelSetMap[pod.labelSetKey], vpa.ID.Namespace, vpa.PodSelector) {
-			cluster.VpaPodCount[vpa.ID]--
+			vpa.PodCount--
 		}
 	}
 }
@@ -268,7 +265,7 @@ func (cluster *ClusterState) AddOrUpdateVpa(apiObject *vpa_types.VerticalPodAuto
 		for aggregationKey, aggregation := range cluster.aggregateStateMap {
 			vpa.UseAggregationIfMatching(aggregationKey, aggregation)
 		}
-		cluster.VpaPodCount[vpaID] = len(cluster.GetMatchingPods(vpa))
+		vpa.PodCount = len(cluster.GetMatchingPods(vpa))
 	}
 	vpa.TargetRef = apiObject.Spec.TargetRef
 	vpa.Annotations = annotationsMap
@@ -290,7 +287,6 @@ func (cluster *ClusterState) DeleteVpa(vpaID VpaID) error {
 	}
 	delete(cluster.Vpas, vpaID)
 	delete(cluster.EmptyVPAs, vpaID)
-	delete(cluster.VpaPodCount, vpaID)
 	return nil
 }
 
diff --git a/vertical-pod-autoscaler/pkg/recommender/model/cluster_test.go b/vertical-pod-autoscaler/pkg/recommender/model/cluster_test.go
@@ -768,7 +768,7 @@ func TestVPAWithMatchingPods(t *testing.T) {
 				containerID := ContainerID{testPodID, "foo"}
 				assert.NoError(t, cluster.AddOrUpdateContainer(containerID, testRequest))
 			}
-			assert.Equal(t, tc.expectedMatch, cluster.VpaPodCount[vpa.ID])
+			assert.Equal(t, tc.expectedMatch, cluster.Vpas[vpa.ID].PodCount)
 		})
 	}
 	// Run with adding Pods first
@@ -781,7 +781,7 @@ func TestVPAWithMatchingPods(t *testing.T) {
 				assert.NoError(t, cluster.AddOrUpdateContainer(containerID, testRequest))
 			}
 			vpa := addVpa(cluster, testVpaID, testAnnotations, tc.vpaSelector)
-			assert.Equal(t, tc.expectedMatch, cluster.VpaPodCount[vpa.ID])
+			assert.Equal(t, tc.expectedMatch, cluster.Vpas[vpa.ID].PodCount)
 		})
 	}
 }
diff --git a/vertical-pod-autoscaler/pkg/recommender/model/vpa.go b/vertical-pod-autoscaler/pkg/recommender/model/vpa.go
@@ -25,6 +25,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
+	metrics_quality "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics/quality"
 	vpa_api_util "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/vpa"
 )
 
@@ -107,6 +108,8 @@ type Vpa struct {
 	IsV1Beta1API bool
 	// TargetRef points to the controller managing the set of pods.
 	TargetRef *autoscaling.CrossVersionObjectReference
+	// PodCount contains number of live Pods matching a given VPA object.
+	PodCount int
 }
 
 // NewVpa returns a new Vpa with a given ID and pod selector. Doesn't set the
@@ -121,6 +124,7 @@ func NewVpa(id VpaID, selector labels.Selector, created time.Time) *Vpa {
 		Annotations:                     make(vpaAnnotationsMap),
 		Conditions:                      make(vpaConditionsMap),
 		IsV1Beta1API:                    false,
+		PodCount:                        0,
 	}
 	return vpa
 }
@@ -143,14 +147,15 @@ func (vpa *Vpa) UseAggregationIfMatching(aggregationKey AggregateStateKey, aggre
 // UpdateRecommendation updates the recommended resources in the VPA and its
 // aggregations with the given recommendation.
 func (vpa *Vpa) UpdateRecommendation(recommendation *vpa_types.RecommendedPodResources) {
-	vpa.Recommendation = recommendation
 	for _, containerRecommendation := range recommendation.ContainerRecommendations {
 		for container, state := range vpa.aggregateContainerStates {
 			if container.ContainerName() == containerRecommendation.ContainerName {
+				metrics_quality.ObserveRecommendationChange(state.LastRecommendation, containerRecommendation.UncappedTarget, vpa.UpdateMode, vpa.PodCount)
 				state.LastRecommendation = containerRecommendation.UncappedTarget
 			}
 		}
 	}
+	vpa.Recommendation = recommendation
 }
 
 // UsesAggregation returns true iff an aggregation with the given key contributes to the VPA.
diff --git a/vertical-pod-autoscaler/pkg/recommender/routines/recommender.go b/vertical-pod-autoscaler/pkg/recommender/routines/recommender.go
@@ -103,18 +103,17 @@ func (r *recommender) UpdateVPAs() {
 		if vpa.HasRecommendation() && !had {
 			metrics_recommender.ObserveRecommendationLatency(vpa.Created)
 		}
-		hasMatchingPods := r.clusterState.VpaPodCount[vpa.ID] > 0
+		hasMatchingPods := vpa.PodCount > 0
 		vpa.UpdateConditions(hasMatchingPods)
 		if err := r.clusterState.RecordRecommendation(vpa, time.Now()); err != nil {
 			klog.Warningf("%v", err)
 			klog.V(4).Infof("VPA dump")
 			klog.V(4).Infof("%+v", vpa)
 			klog.V(4).Infof("HasMatchingPods: %v", hasMatchingPods)
-			podCount := r.clusterState.VpaPodCount[vpa.ID]
-			klog.V(4).Infof("VpaPodCount: %v", podCount)
+			klog.V(4).Infof("PodCount: %v", vpa.PodCount)
 			pods := r.clusterState.GetMatchingPods(vpa)
 			klog.V(4).Infof("MatchingPods: %+v", pods)
-			if len(pods) != podCount {
+			if len(pods) != vpa.PodCount {
 				klog.Errorf("ClusterState pod count and matching pods disagree for vpa %v/%v", vpa.ID.Namespace, vpa.ID.VpaName)
 			}
 		}
diff --git a/vertical-pod-autoscaler/pkg/utils/metrics/quality/quality.go b/vertical-pod-autoscaler/pkg/utils/metrics/quality/quality.go
@@ -22,6 +22,7 @@ import (
 	"strconv"
 
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
 	"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics"
 	"k8s.io/klog"
@@ -38,6 +39,9 @@ var (
 	cpuBuckets = prometheus.ExponentialBuckets(0.01, 2., 17)
 	// Buckets between 1MB and 65.5 GB
 	memoryBuckets = prometheus.ExponentialBuckets(1e6, 2., 17)
+	// Buckets for relative comparisons, from -100% to x100
+	relativeBuckets = []float64{-1., -.75, -.5, -.25, -.1, -.05, -0.025, -.01, -.005, -0.0025, -.001,
+		0., .001, .0025, .005, .01, .025, .05, .1, .25, .5, .75, 1., 2.5, 5., 10., 25., 50., 100.}
 )
 
 var (
@@ -46,8 +50,7 @@ var (
 			Namespace: metricsNamespace,
 			Name:      "usage_recommendation_relative_diffs",
 			Help:      "Diffs between recommendation and usage, normalized by recommendation value",
-			Buckets: []float64{-1., -.75, -.5, -.25, -.1, -.05, -0.025, -.01, -.005, -0.0025, -.001, 0.,
-				.001, .0025, .005, .01, .025, .05, .1, .25, .5, .75, 1., 2.5, 5., 10., 25., 50., 100.},
+			Buckets:   relativeBuckets,
 		}, []string{"update_mode", "resource", "is_oom"},
 	)
 	usageMissingRecommendationCounter = prometheus.NewCounterVec(
@@ -105,6 +108,14 @@ var (
 			Buckets:   memoryBuckets,
 		}, []string{"update_mode", "is_oom"},
 	)
+	relativeRecommendationChange = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: metricsNamespace,
+			Name:      "relative_recommendation_changes",
+			Help:      "Changes between consecutive recommendation values, normalized by old value",
+			Buckets:   relativeBuckets,
+		}, []string{"update_mode", "resource", "vpa_size_log2"},
+	)
 )
 
 // Register initializes all VPA quality metrics
@@ -117,6 +128,7 @@ func Register() {
 	prometheus.MustRegister(memoryRecommendationLowerOrEqualUsageDiff)
 	prometheus.MustRegister(cpuRecommendations)
 	prometheus.MustRegister(memoryRecommendations)
+	prometheus.MustRegister(relativeRecommendationChange)
 }
 
 // observeUsageRecommendationRelativeDiff records relative diff between usage and
@@ -184,6 +196,44 @@ func ObserveQualityMetricsRecommendationMissing(usage float64, isOOM bool, resou
 	observeUsageRecommendationDiff(usage, 0, true, isOOM, resource, updateMode)
 }
 
+// ObserveRecommendationChange records the relative_recommendation_changes metric: for every
+// resource in current it observes current/previous - 1. A nil previous (no earlier
+// recommendation) emits nothing; a nil current or a non-positive previous value only warns.
+func ObserveRecommendationChange(previous, current corev1.ResourceList, updateMode *vpa_types.UpdateMode, vpaSize int) {
+	if previous == nil {
+		return
+	}
+	if current == nil {
+		klog.Warningf("Cannot compare with current recommendation being nil. VPA mode: %v, size: %v", updateMode, vpaSize)
+		return
+	}
+	for resource, amount := range current {
+		newValue := quantityAsFloat(resource, amount)
+		oldValue := quantityAsFloat(resource, previous[resource])
+		if oldValue > 0.0 {
+			diff := newValue/oldValue - 1.0 // -1.0 to report decreases as negative values and keep 0.0 neutral
+			// strconv.Itoa gives decimal text; string(int) would yield the rune with that code point instead.
+			// NOTE(review): the label is named vpa_size_log2 but receives the raw size - confirm intent.
+			relativeRecommendationChange.WithLabelValues(updateModeToString(updateMode), string(resource), strconv.Itoa(vpaSize)).Observe(diff)
+		} else {
+			klog.Warningf("Cannot compare as old recommendation for %v is 0. VPA mode: %v, size: %v", resource, updateMode, vpaSize)
+		}
+	}
+}
+
+// quantityAsFloat turns a resource.Quantity into a float64, using MilliValue for CPU and
+// Value for memory. Any other resource is logged as unknown and reported as 0.0.
+func quantityAsFloat(resource corev1.ResourceName, quantity resource.Quantity) float64 {
+	if resource == corev1.ResourceCPU {
+		return float64(quantity.MilliValue())
+	}
+	if resource == corev1.ResourceMemory {
+		return float64(quantity.Value())
+	}
+	klog.Warningf("Unknown resource: %v", resource)
+	return 0.0
+}
+
 func updateModeToString(updateMode *vpa_types.UpdateMode) string {
 	if updateMode == nil {
 		return ""

Original file line number	Diff line number	Diff line change
`@@ -46,8 +46,6 @@ type ClusterState struct {`
`46`	`46`	`// time we've noticed the recommendation missing or last time we logged`
`47`	`47`	`// a warning about it.`
`48`	`48`	`EmptyVPAs map[VpaID]time.Time`
`49`		`- // VpaPodCount contains number of live Pods matching a given VPA object.`
`50`		`- VpaPodCount map[VpaID]int`
`51`	`49`	`// Observed VPAs. Used to check if there are updates needed.`
`52`	`50`	`ObservedVpas []*vpa_types.VerticalPodAutoscaler`
`53`	`51`
`@@ -99,7 +97,6 @@ func NewClusterState() *ClusterState {`
`99`	`97`	`Pods: make(map[PodID]*PodState),`
`100`	`98`	`Vpas: make(map[VpaID]*Vpa),`
`101`	`99`	`EmptyVPAs: make(map[VpaID]time.Time),`
`102`		`- VpaPodCount: make(map[VpaID]int),`
`103`	`100`	`aggregateStateMap: make(aggregateContainerStatesMap),`
`104`	`101`	`labelSetMap: make(labelSetMap),`
`105`	`102`	`}`
`@@ -147,7 +144,7 @@ func (cluster *ClusterState) AddOrUpdatePod(podID PodID, newLabels labels.Set, p`
`147`	`144`	`func (cluster *ClusterState) addPodToItsVpa(pod *PodState) {`
`148`	`145`	`for _, vpa := range cluster.Vpas {`
`149`	`146`	`if vpa_utils.PodLabelsMatchVPA(pod.ID.Namespace, cluster.labelSetMap[pod.labelSetKey], vpa.ID.Namespace, vpa.PodSelector) {`
`150`		`- cluster.VpaPodCount[vpa.ID]++`
	`147`	`+ vpa.PodCount++`
`151`	`148`	`}`
`152`	`149`	`}`
`153`	`150`	`}`
`@@ -156,7 +153,7 @@ func (cluster *ClusterState) addPodToItsVpa(pod *PodState) {`
`156`	`153`	`func (cluster *ClusterState) removePodFromItsVpa(pod *PodState) {`
`157`	`154`	`for _, vpa := range cluster.Vpas {`
`158`	`155`	`if vpa_utils.PodLabelsMatchVPA(pod.ID.Namespace, cluster.labelSetMap[pod.labelSetKey], vpa.ID.Namespace, vpa.PodSelector) {`
`159`		`- cluster.VpaPodCount[vpa.ID]--`
	`156`	`+ vpa.PodCount--`
`160`	`157`	`}`
`161`	`158`	`}`
`162`	`159`	`}`
`@@ -268,7 +265,7 @@ func (cluster *ClusterState) AddOrUpdateVpa(apiObject *vpa_types.VerticalPodAuto`
`268`	`265`	`for aggregationKey, aggregation := range cluster.aggregateStateMap {`
`269`	`266`	`vpa.UseAggregationIfMatching(aggregationKey, aggregation)`
`270`	`267`	`}`
`271`		`- cluster.VpaPodCount[vpaID] = len(cluster.GetMatchingPods(vpa))`
	`268`	`+ vpa.PodCount = len(cluster.GetMatchingPods(vpa))`
`272`	`269`	`}`
`273`	`270`	`vpa.TargetRef = apiObject.Spec.TargetRef`
`274`	`271`	`vpa.Annotations = annotationsMap`
`@@ -290,7 +287,6 @@ func (cluster *ClusterState) DeleteVpa(vpaID VpaID) error {`
`290`	`287`	`}`
`291`	`288`	`delete(cluster.Vpas, vpaID)`
`292`	`289`	`delete(cluster.EmptyVPAs, vpaID)`
`293`		`- delete(cluster.VpaPodCount, vpaID)`
`294`	`290`	`return nil`
`295`	`291`	`}`
`296`	`292`
Original file line number	Diff line number	Diff line change
`@@ -768,7 +768,7 @@ func TestVPAWithMatchingPods(t *testing.T) {`
`768`	`768`	`containerID := ContainerID{testPodID, "foo"}`
`769`	`769`	`assert.NoError(t, cluster.AddOrUpdateContainer(containerID, testRequest))`
`770`	`770`	`}`
`771`		`- assert.Equal(t, tc.expectedMatch, cluster.VpaPodCount[vpa.ID])`
	`771`	`+ assert.Equal(t, tc.expectedMatch, cluster.Vpas[vpa.ID].PodCount)`
`772`	`772`	`})`
`773`	`773`	`}`
`774`	`774`	`// Run with adding Pods first`
`@@ -781,7 +781,7 @@ func TestVPAWithMatchingPods(t *testing.T) {`
`781`	`781`	`assert.NoError(t, cluster.AddOrUpdateContainer(containerID, testRequest))`
`782`	`782`	`}`
`783`	`783`	`vpa := addVpa(cluster, testVpaID, testAnnotations, tc.vpaSelector)`
`784`		`- assert.Equal(t, tc.expectedMatch, cluster.VpaPodCount[vpa.ID])`
	`784`	`+ assert.Equal(t, tc.expectedMatch, cluster.Vpas[vpa.ID].PodCount)`
`785`	`785`	`})`
`786`	`786`	`}`
`787`	`787`	`}`
Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,7 @@ import (`
`25`	`25`	`metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"`
`26`	`26`	`"k8s.io/apimachinery/pkg/labels"`
`27`	`27`	`vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"`
	`28`	`+ metrics_quality "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics/quality"`
`28`	`29`	`vpa_api_util "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/vpa"`
`29`	`30`	`)`
`30`	`31`
`@@ -107,6 +108,8 @@ type Vpa struct {`
`107`	`108`	`IsV1Beta1API bool`
`108`	`109`	`// TargetRef points to the controller managing the set of pods.`
`109`	`110`	`TargetRef *autoscaling.CrossVersionObjectReference`
	`111`	`+ // PodCount contains number of live Pods matching a given VPA object.`
	`112`	`+ PodCount int`
`110`	`113`	`}`
`111`	`114`
`112`	`115`	`// NewVpa returns a new Vpa with a given ID and pod selector. Doesn't set the`
`@@ -121,6 +124,7 @@ func NewVpa(id VpaID, selector labels.Selector, created time.Time) *Vpa {`
`121`	`124`	`Annotations: make(vpaAnnotationsMap),`
`122`	`125`	`Conditions: make(vpaConditionsMap),`
`123`	`126`	`IsV1Beta1API: false,`
	`127`	`+ PodCount: 0,`
`124`	`128`	`}`
`125`	`129`	`return vpa`
`126`	`130`	`}`
`@@ -143,14 +147,15 @@ func (vpa *Vpa) UseAggregationIfMatching(aggregationKey AggregateStateKey, aggre`
`143`	`147`	`// UpdateRecommendation updates the recommended resources in the VPA and its`
`144`	`148`	`// aggregations with the given recommendation.`
`145`	`149`	`func (vpa *Vpa) UpdateRecommendation(recommendation *vpa_types.RecommendedPodResources) {`
`146`		`- vpa.Recommendation = recommendation`
`147`	`150`	`for _, containerRecommendation := range recommendation.ContainerRecommendations {`
`148`	`151`	`for container, state := range vpa.aggregateContainerStates {`
`149`	`152`	`if container.ContainerName() == containerRecommendation.ContainerName {`
	`153`	`+ metrics_quality.ObserveRecommendationChange(state.LastRecommendation, containerRecommendation.UncappedTarget, vpa.UpdateMode, vpa.PodCount)`
`150`	`154`	`state.LastRecommendation = containerRecommendation.UncappedTarget`
`151`	`155`	`}`
`152`	`156`	`}`
`153`	`157`	`}`
	`158`	`+ vpa.Recommendation = recommendation`
`154`	`159`	`}`
`155`	`160`
`156`	`161`	`// UsesAggregation returns true iff an aggregation with the given key contributes to the VPA.`