Skip to content

Commit 164e9fe

Browse files
authored
Merge pull request kubernetes#94124 from nilo19/feature/add-operation-metrics
Add metrics for azure service operations (route and loadbalancer).
2 parents 16ea9dc + 88c72cc commit 164e9fe

File tree

6 files changed

+133
-14
lines changed

6 files changed

+133
-14
lines changed

staging/src/k8s.io/legacy-cloud-providers/azure/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ go_library(
9191
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient:go_default_library",
9292
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient:go_default_library",
9393
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient:go_default_library",
94+
"//staging/src/k8s.io/legacy-cloud-providers/azure/metrics:go_default_library",
9495
"//staging/src/k8s.io/legacy-cloud-providers/azure/retry:go_default_library",
9596
"//vendor/github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute:go_default_library",
9697
"//vendor/github.com/Azure/azure-sdk-for-go/services/network/mgmt/2019-06-01/network:go_default_library",

staging/src/k8s.io/legacy-cloud-providers/azure/azure_loadbalancer.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
servicehelpers "k8s.io/cloud-provider/service/helpers"
3737
"k8s.io/klog/v2"
3838
azcache "k8s.io/legacy-cloud-providers/azure/cache"
39+
"k8s.io/legacy-cloud-providers/azure/metrics"
3940
"k8s.io/legacy-cloud-providers/azure/retry"
4041
utilnet "k8s.io/utils/net"
4142
)
@@ -157,6 +158,12 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser
157158
serviceName := getServiceName(service)
158159
klog.V(5).Infof("ensureloadbalancer(%s): START clusterName=%q", serviceName, clusterName)
159160

161+
mc := metrics.NewMetricContext("services", "ensure_loadbalancer", az.ResourceGroup, az.SubscriptionID, serviceName)
162+
isOperationSucceeded := false
163+
defer func() {
164+
mc.ObserveOperationWithResult(isOperationSucceeded)
165+
}()
166+
160167
lb, err := az.reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */)
161168
if err != nil {
162169
klog.Errorf("reconcileLoadBalancer(%s) failed: %v", serviceName, err)
@@ -192,6 +199,8 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser
192199
return nil, err
193200
}
194201

202+
isOperationSucceeded = true
203+
195204
return lbStatus, nil
196205
}
197206

@@ -216,6 +225,12 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri
216225
serviceName := getServiceName(service)
217226
klog.V(5).Infof("Delete service (%s): START clusterName=%q", serviceName, clusterName)
218227

228+
mc := metrics.NewMetricContext("services", "ensure_loadbalancer_deleted", az.ResourceGroup, az.SubscriptionID, serviceName)
229+
isOperationSucceeded := false
230+
defer func() {
231+
mc.ObserveOperationWithResult(isOperationSucceeded)
232+
}()
233+
219234
serviceIPToCleanup, err := az.findServiceIPAddress(ctx, clusterName, service, isInternal)
220235
if err != nil && !retry.HasStatusForbiddenOrIgnoredError(err) {
221236
return err
@@ -235,6 +250,8 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri
235250
}
236251

237252
klog.V(2).Infof("Delete service (%s): FINISH", serviceName)
253+
isOperationSucceeded = true
254+
238255
return nil
239256
}
240257

staging/src/k8s.io/legacy-cloud-providers/azure/azure_routes.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
cloudprovider "k8s.io/cloud-provider"
3434
"k8s.io/klog/v2"
3535
azcache "k8s.io/legacy-cloud-providers/azure/cache"
36+
"k8s.io/legacy-cloud-providers/azure/metrics"
3637
utilnet "k8s.io/utils/net"
3738
)
3839

@@ -282,6 +283,12 @@ func (az *Cloud) createRouteTable() error {
282283
// route.Name will be ignored, although the cloud-provider may use nameHint
283284
// to create a more user-meaningful name.
284285
func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint string, kubeRoute *cloudprovider.Route) error {
286+
mc := metrics.NewMetricContext("routes", "create_route", az.ResourceGroup, az.SubscriptionID, "")
287+
isOperationSucceeded := false
288+
defer func() {
289+
mc.ObserveOperationWithResult(isOperationSucceeded)
290+
}()
291+
285292
// Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them.
286293
var targetIP string
287294
nodeName := string(kubeRoute.TargetNode)
@@ -351,12 +358,20 @@ func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint s
351358
}
352359

353360
klog.V(2).Infof("CreateRoute: route created. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR)
361+
isOperationSucceeded = true
362+
354363
return nil
355364
}
356365

357366
// DeleteRoute deletes the specified managed route
358367
// Route should be as returned by ListRoutes
359368
func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute *cloudprovider.Route) error {
369+
mc := metrics.NewMetricContext("routes", "delete_route", az.ResourceGroup, az.SubscriptionID, "")
370+
isOperationSucceeded := false
371+
defer func() {
372+
mc.ObserveOperationWithResult(isOperationSucceeded)
373+
}()
374+
360375
// Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them.
361376
nodeName := string(kubeRoute.TargetNode)
362377
unmanaged, err := az.IsNodeUnmanaged(nodeName)
@@ -392,6 +407,8 @@ func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute
392407
}
393408

394409
klog.V(2).Infof("DeleteRoute: route deleted. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR)
410+
isOperationSucceeded = true
411+
395412
return nil
396413
}
397414

staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,10 @@ import (
3838
"k8s.io/apimachinery/pkg/util/sets"
3939
"k8s.io/apimachinery/pkg/util/uuid"
4040
cloudprovider "k8s.io/cloud-provider"
41+
"k8s.io/component-base/featuregate"
4142
"k8s.io/klog/v2"
4243
azcache "k8s.io/legacy-cloud-providers/azure/cache"
43-
44-
"k8s.io/component-base/featuregate"
44+
"k8s.io/legacy-cloud-providers/azure/metrics"
4545
utilnet "k8s.io/utils/net"
4646
)
4747

@@ -808,6 +808,12 @@ func (as *availabilitySet) EnsureHostInPool(service *v1.Service, nodeName types.
808808
// EnsureHostsInPool ensures the given Node's primary IP configurations are
809809
// participating in the specified LoadBalancer Backend Pool.
810810
func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error {
811+
mc := metrics.NewMetricContext("services", "vmas_ensure_hosts_in_pool", as.ResourceGroup, as.SubscriptionID, service.Name)
812+
isOperationSucceeded := false
813+
defer func() {
814+
mc.ObserveOperationWithResult(isOperationSucceeded)
815+
}()
816+
811817
hostUpdates := make([]func() error, 0, len(nodes))
812818
for _, node := range nodes {
813819
localNodeName := node.Name
@@ -836,6 +842,7 @@ func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.No
836842
return utilerrors.Flatten(errs)
837843
}
838844

845+
isOperationSucceeded = true
839846
return nil
840847
}
841848

staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
cloudprovider "k8s.io/cloud-provider"
3838
"k8s.io/klog/v2"
3939
azcache "k8s.io/legacy-cloud-providers/azure/cache"
40+
"k8s.io/legacy-cloud-providers/azure/metrics"
4041
utilnet "k8s.io/utils/net"
4142
)
4243

@@ -1183,6 +1184,12 @@ func (ss *scaleSet) ensureVMSSInPool(service *v1.Service, nodes []*v1.Node, back
11831184
// EnsureHostsInPool ensures the given Node's primary IP configurations are
11841185
// participating in the specified LoadBalancer Backend Pool.
11851186
func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error {
1187+
mc := metrics.NewMetricContext("services", "vmss_ensure_hosts_in_pool", ss.ResourceGroup, ss.SubscriptionID, service.Name)
1188+
isOperationSucceeded := false
1189+
defer func() {
1190+
mc.ObserveOperationWithResult(isOperationSucceeded)
1191+
}()
1192+
11861193
hostUpdates := make([]func() error, 0, len(nodes))
11871194
nodeUpdates := make(map[vmssMetaInfo]map[string]compute.VirtualMachineScaleSetVM)
11881195
errors := make([]error, 0)
@@ -1281,6 +1288,7 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
12811288
return err
12821289
}
12831290

1291+
isOperationSucceeded = true
12841292
return nil
12851293
}
12861294

@@ -1484,6 +1492,12 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID,
14841492
return nil
14851493
}
14861494

1495+
mc := metrics.NewMetricContext("services", "vmss_ensure_backend_pool_deleted", ss.ResourceGroup, ss.SubscriptionID, service.Name)
1496+
isOperationSucceeded := false
1497+
defer func() {
1498+
mc.ObserveOperationWithResult(isOperationSucceeded)
1499+
}()
1500+
14871501
ipConfigurationIDs := []string{}
14881502
for _, backendPool := range *backendAddressPools {
14891503
if strings.EqualFold(*backendPool.ID, backendPoolID) && backendPool.BackendIPConfigurations != nil {
@@ -1582,5 +1596,6 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID,
15821596
return err
15831597
}
15841598

1599+
isOperationSucceeded = true
15851600
return nil
15861601
}

staging/src/k8s.io/legacy-cloud-providers/azure/metrics/azure_metrics.go

Lines changed: 74 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,38 @@ import (
2626
"k8s.io/component-base/metrics/legacyregistry"
2727
)
2828

29-
type apiCallMetrics struct {
30-
latency *metrics.HistogramVec
31-
errors *metrics.CounterVec
32-
rateLimitedCount *metrics.CounterVec
33-
throttledCount *metrics.CounterVec
34-
}
29+
const (
30+
azureMetricsNamespace = "cloudprovider_azure"
31+
)
3532

3633
var (
3734
metricLabels = []string{
3835
"request", // API function that is being invoked
3936
"resource_group", // Resource group of the resource being monitored
4037
"subscription_id", // Subscription ID of the resource being monitored
41-
"source", // Oeration source(optional)
38+
"source", // Operation source(optional)
4239
}
4340

44-
apiMetrics = registerAPIMetrics(metricLabels...)
41+
apiMetrics = registerAPIMetrics(metricLabels...)
42+
operationMetrics = registerOperationMetrics(metricLabels...)
4543
)
4644

45+
// apiCallMetrics holds the metrics measuring the performance of a single API call
46+
// e.g., GET, POST ...
47+
type apiCallMetrics struct {
48+
latency *metrics.HistogramVec
49+
errors *metrics.CounterVec
50+
rateLimitedCount *metrics.CounterVec
51+
throttledCount *metrics.CounterVec
52+
}
53+
54+
// operationCallMetrics holds the metrics measuring the performance of a whole operation
55+
// e.g., the create / update / delete process of a loadbalancer or route.
56+
type operationCallMetrics struct {
57+
operationLatency *metrics.HistogramVec
58+
operationFailureCount *metrics.CounterVec
59+
}
60+
4761
// MetricContext indicates the context for Azure client metrics.
4862
type MetricContext struct {
4963
start time.Time
@@ -79,36 +93,54 @@ func (mc *MetricContext) Observe(err error) error {
7993
return err
8094
}
8195

96+
// ObserveOperationWithResult records the latency of an operation and, if the operation did not succeed, increments the failed-operation counter.
97+
func (mc *MetricContext) ObserveOperationWithResult(isOperationSucceeded bool) {
98+
operationMetrics.operationLatency.WithLabelValues(mc.attributes...).Observe(
99+
time.Since(mc.start).Seconds())
100+
if !isOperationSucceeded {
101+
mc.CountFailedOperation()
102+
}
103+
}
104+
105+
// CountFailedOperation increase the number of failed operations
106+
func (mc *MetricContext) CountFailedOperation() {
107+
operationMetrics.operationFailureCount.WithLabelValues(mc.attributes...).Inc()
108+
}
109+
82110
// registerAPIMetrics registers the API metrics.
83111
func registerAPIMetrics(attributes ...string) *apiCallMetrics {
84112
metrics := &apiCallMetrics{
85113
latency: metrics.NewHistogramVec(
86114
&metrics.HistogramOpts{
87-
Name: "cloudprovider_azure_api_request_duration_seconds",
115+
Namespace: azureMetricsNamespace,
116+
Name: "api_request_duration_seconds",
88117
Help: "Latency of an Azure API call",
89118
StabilityLevel: metrics.ALPHA,
90119
},
91120
attributes,
92121
),
93122
errors: metrics.NewCounterVec(
94123
&metrics.CounterOpts{
95-
Name: "cloudprovider_azure_api_request_errors",
124+
Namespace: azureMetricsNamespace,
125+
Name: "api_request_errors",
96126
Help: "Number of errors for an Azure API call",
97127
StabilityLevel: metrics.ALPHA,
98128
},
99129
attributes,
100130
),
101131
rateLimitedCount: metrics.NewCounterVec(
102132
&metrics.CounterOpts{
103-
Name: "cloudprovider_azure_api_request_ratelimited_count",
133+
Namespace: azureMetricsNamespace,
134+
Name: "api_request_ratelimited_count",
104135
Help: "Number of rate limited Azure API calls",
105136
StabilityLevel: metrics.ALPHA,
106137
},
107138
attributes,
108139
),
109140
throttledCount: metrics.NewCounterVec(
110141
&metrics.CounterOpts{
111-
Name: "cloudprovider_azure_api_request_throttled_count",
142+
Namespace: azureMetricsNamespace,
143+
Name: "api_request_throttled_count",
112144
Help: "Number of throttled Azure API calls",
113145
StabilityLevel: metrics.ALPHA,
114146
},
@@ -123,3 +155,33 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
123155

124156
return metrics
125157
}
158+
159+
// registerOperationMetrics registers the operation metrics.
160+
func registerOperationMetrics(attributes ...string) *operationCallMetrics {
161+
metrics := &operationCallMetrics{
162+
operationLatency: metrics.NewHistogramVec(
163+
&metrics.HistogramOpts{
164+
Namespace: azureMetricsNamespace,
165+
Name: "op_duration_seconds",
166+
Help: "Latency of an Azure service operation",
167+
StabilityLevel: metrics.ALPHA,
168+
Buckets: []float64{0.1, 0.2, 0.5, 1, 10, 20, 30, 40, 50, 60, 100, 200, 300},
169+
},
170+
attributes,
171+
),
172+
operationFailureCount: metrics.NewCounterVec(
173+
&metrics.CounterOpts{
174+
Namespace: azureMetricsNamespace,
175+
Name: "op_failure_count",
176+
Help: "Number of failed Azure service operations",
177+
StabilityLevel: metrics.ALPHA,
178+
},
179+
attributes,
180+
),
181+
}
182+
183+
legacyregistry.MustRegister(metrics.operationLatency)
184+
legacyregistry.MustRegister(metrics.operationFailureCount)
185+
186+
return metrics
187+
}

0 commit comments

Comments
 (0)