Skip to content

Commit bbc7b34

Browse files
Ryan Zhang (ryanzhang-oss) authored
feat: add more metrics and logs for measurement (#324)
* temp
* add more metrics and logs for metrics
* address comment

Co-authored-by: Ryan Zhang <zhangryan@microsoft.com>
1 parent cfb41a1 commit bbc7b34

File tree

13 files changed

+94
-23
lines changed

13 files changed

+94
-23
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
REGISTRY ?= ghcr.io
2-
KIND_IMAGE ?= kindest/node:v1.23.3
2+
KIND_IMAGE ?= kindest/node:v1.24.6
33
ifndef TAG
44
TAG ?= $(shell git rev-parse --short=7 HEAD)
55
endif

cmd/hubagent/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ func init() {
5252
//+kubebuilder:scaffold:scheme
5353
klog.InitFlags(nil)
5454

55-
metrics.Registry.MustRegister(fleetmetrics.JoinResultMetrics, fleetmetrics.LeaveResultMetrics)
55+
metrics.Registry.MustRegister(fleetmetrics.JoinResultMetrics, fleetmetrics.LeaveResultMetrics, fleetmetrics.PlacementApplyFailedCount, fleetmetrics.PlacementApplySucceedCount)
5656
}
5757

5858
func main() {

cmd/memberagent/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func init() {
5757
utilruntime.Must(workv1alpha1.AddToScheme(scheme))
5858
//+kubebuilder:scaffold:scheme
5959

60-
metrics.Registry.MustRegister(fleetmetrics.JoinResultMetrics, fleetmetrics.LeaveResultMetrics)
60+
metrics.Registry.MustRegister(fleetmetrics.JoinResultMetrics, fleetmetrics.LeaveResultMetrics, fleetmetrics.WorkApplyTime)
6161
}
6262

6363
func main() {

pkg/controllers/clusterresourceplacement/placement_controller.go

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"sigs.k8s.io/controller-runtime/pkg/client"
2323

2424
fleetv1alpha1 "go.goms.io/fleet/apis/v1alpha1"
25+
"go.goms.io/fleet/pkg/metrics"
2526
"go.goms.io/fleet/pkg/utils"
2627
"go.goms.io/fleet/pkg/utils/controller"
2728
"go.goms.io/fleet/pkg/utils/informer"
@@ -59,6 +60,7 @@ type Reconciler struct {
5960
}
6061

6162
func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ctrl.Result, error) {
63+
startTime := time.Now()
6264
name, ok := key.(string)
6365
if !ok {
6466
err := fmt.Errorf("get place key %+v not of type string", key)
@@ -73,9 +75,12 @@ func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ct
7375
}
7476
placeRef := klog.KObj(placementOld)
7577
placementNew := placementOld.DeepCopy()
78+
// add latency log
79+
defer func() {
80+
klog.V(2).InfoS("ClusterResourcePlacement reconciliation loop ends", "placement", placeRef, "latency", time.Since(startTime).Milliseconds())
81+
}()
7682

7783
// TODO: add finalizer logic if we need it in the future
78-
7984
klog.V(2).InfoS("Start to reconcile a ClusterResourcePlacement", "placement", placeRef)
8085
// select the new clusters and record that in the placementNew status
8186
selectedClusters, scheduleErr := r.selectClusters(placementNew)
@@ -120,7 +125,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ct
120125
}
121126
klog.V(2).InfoS("Successfully persisted the intermediate scheduling result", "placement", placementOld.Name,
122127
"totalClusters", totalCluster, "totalResources", totalResources)
123-
// pick up the new version so placementNew can continue to update
128+
// pick up the newly updated schedule condition so that the last schedule time will change every time we run the reconcile loop
129+
meta.SetStatusCondition(&placementNew.Status.Conditions, *placementOld.GetCondition(string(fleetv1alpha1.ResourcePlacementConditionTypeScheduled)))
130+
// pick up the new version so that we can update placementNew without getting it again
124131
placementNew.SetResourceVersion(placementOld.GetResourceVersion())
125132

126133
// schedule works for each cluster by placing them in the cluster scoped namespace
@@ -254,17 +261,17 @@ func (r *Reconciler) updatePlacementScheduledCondition(placement *fleetv1alpha1.
254261
placementRef := klog.KObj(placement)
255262
schedCond := placement.GetCondition(string(fleetv1alpha1.ResourcePlacementConditionTypeScheduled))
256263
if scheduleErr == nil {
264+
if schedCond == nil || schedCond.Status != metav1.ConditionTrue {
265+
klog.V(2).InfoS("successfully scheduled all selected resources to their clusters", "placement", placementRef)
266+
r.Recorder.Event(placement, corev1.EventTypeNormal, "ResourceScheduled", "successfully scheduled all selected resources to their clusters")
267+
}
257268
placement.SetConditions(metav1.Condition{
258269
Status: metav1.ConditionTrue,
259270
Type: string(fleetv1alpha1.ResourcePlacementConditionTypeScheduled),
260271
Reason: "ScheduleSucceeded",
261272
Message: "Successfully scheduled resources for placement",
262273
ObservedGeneration: placement.Generation,
263274
})
264-
if schedCond == nil || schedCond.Status != metav1.ConditionTrue {
265-
klog.V(2).InfoS("successfully scheduled all selected resources to their clusters", "placement", placementRef)
266-
r.Recorder.Event(placement, corev1.EventTypeNormal, "ResourceScheduled", "successfully scheduled all selected resources to their clusters")
267-
}
268275
} else {
269276
placement.SetConditions(metav1.Condition{
270277
Status: metav1.ConditionFalse,
@@ -293,8 +300,9 @@ func (r *Reconciler) updatePlacementAppliedCondition(placement *fleetv1alpha1.Cl
293300
Message: "Successfully applied resources to member clusters",
294301
ObservedGeneration: placement.Generation,
295302
})
296-
klog.V(2).InfoS("successfully applied all selected resources", "placement", placementRef)
297303
if preAppliedCond == nil || preAppliedCond.Status != metav1.ConditionTrue {
304+
klog.V(2).InfoS("successfully applied all selected resources", "placement", placementRef)
305+
metrics.PlacementApplySucceedCount.WithLabelValues(placement.GetName()).Inc()
298306
r.Recorder.Event(placement, corev1.EventTypeNormal, "ResourceApplied", "successfully applied all selected resources")
299307
}
300308
case errors.Is(applyErr, ErrStillPendingManifest):
@@ -305,8 +313,8 @@ func (r *Reconciler) updatePlacementAppliedCondition(placement *fleetv1alpha1.Cl
305313
Message: applyErr.Error(),
306314
ObservedGeneration: placement.Generation,
307315
})
308-
klog.V(2).InfoS("Some selected resources are still waiting to be applied", "placement", placementRef)
309316
if preAppliedCond == nil || preAppliedCond.Status == metav1.ConditionTrue {
317+
klog.V(2).InfoS("Some selected resources are still waiting to be applied", "placement", placementRef)
310318
r.Recorder.Event(placement, corev1.EventTypeWarning, "ResourceApplyPending", "Some applied resources are now waiting to be applied to the member cluster")
311319
}
312320
default:
@@ -318,8 +326,9 @@ func (r *Reconciler) updatePlacementAppliedCondition(placement *fleetv1alpha1.Cl
318326
Message: applyErr.Error(),
319327
ObservedGeneration: placement.Generation,
320328
})
321-
klog.V(2).InfoS("failed to apply some selected resources", "placement", placementRef)
322329
if preAppliedCond == nil || preAppliedCond.Status != metav1.ConditionFalse {
330+
klog.V(2).InfoS("failed to apply some selected resources", "placement", placementRef)
331+
metrics.PlacementApplyFailedCount.WithLabelValues(placement.GetName()).Inc()
323332
r.Recorder.Event(placement, corev1.EventTypeWarning, "ResourceApplyFailed", "failed to apply some selected resources")
324333
}
325334
}

pkg/controllers/clusterresourceplacement/work_propagation.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@ import (
3131
)
3232

3333
const (
34-
LastUpdateAnnotationKey = "work.fleet.azure.com/last-update-time"
35-
SpecHashAnnotationKey = "work.fleet.azure.com/spec-hash-value"
34+
specHashAnnotationKey = "work.fleet.azure.com/spec-hash-value"
3635
)
3736

3837
// scheduleWork creates or updates the work object to reflect the new placement decision.
@@ -63,8 +62,8 @@ func (r *Reconciler) scheduleWork(ctx context.Context, placement *fleetv1alpha1.
6362
utils.LabelFleetObj: utils.LabelFleetObjValue,
6463
}
6564
workAnnotation := map[string]string{
66-
LastUpdateAnnotationKey: time.Now().Format(time.RFC3339),
67-
SpecHashAnnotationKey: specHash,
65+
utils.LastWorkUpdateTimeAnnotationKey: time.Now().Format(time.RFC3339),
66+
specHashAnnotationKey: specHash,
6867
}
6968
changed := false
7069
for _, memberClusterName := range memberClusterNames {
@@ -98,7 +97,7 @@ func (r *Reconciler) scheduleWork(ctx context.Context, placement *fleetv1alpha1.
9897
changed = true
9998
continue
10099
}
101-
existingHash := curWork.GetAnnotations()[SpecHashAnnotationKey]
100+
existingHash := curWork.GetAnnotations()[specHashAnnotationKey]
102101
if existingHash == specHash || reflect.DeepEqual(curWork.Spec.Workload.Manifests, workerSpec.Workload.Manifests) {
103102
klog.V(2).InfoS("skip updating work spec as its identical",
104103
"member cluster namespace", memberClusterNsName, "work name", workName, "number of manifests", len(manifests))

pkg/controllers/membercluster/membercluster_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ func markMemberClusterLeft(recorder record.EventRecorder, mc apis.ConditionedObj
527527
if existingCondition == nil || existingCondition.Status != newCondition.Status {
528528
recorder.Event(mc, corev1.EventTypeNormal, reasonMemberClusterJoined, "member cluster left")
529529
klog.V(2).InfoS("memberCluster left", "memberCluster", klog.KObj(mc))
530-
metrics.ReportJoinResultMetric()
530+
metrics.ReportLeaveResultMetric()
531531
}
532532

533533
mc.SetConditions(newCondition, notReadyCondition)

pkg/controllers/memberclusterplacement/membercluster_controller.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package memberclusterplacement
88
import (
99
"context"
1010
"fmt"
11+
"time"
1112

1213
apierrors "k8s.io/apimachinery/pkg/api/errors"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -34,20 +35,27 @@ type Reconciler struct {
3435
}
3536

3637
func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ctrl.Result, error) {
38+
startTime := time.Now()
3739
memberClusterName, ok := key.(string)
3840
if !ok {
3941
err := fmt.Errorf("got a resource key %+v not of type cluster wide key", key)
4042
klog.ErrorS(err, "we have encountered a fatal error that can't be retried")
4143
return ctrl.Result{}, err
4244
}
4345

46+
// add latency log
47+
defer func() {
48+
klog.V(2).InfoS("MemberClusterPlacement reconciliation loop ends", "memberCluster", memberClusterName, "latency", time.Since(startTime).Milliseconds())
49+
}()
50+
4451
klog.V(2).InfoS("Start to reconcile a member cluster to enqueue placement events", "memberCluster", memberClusterName)
4552
mObj, err := r.InformerManager.Lister(utils.MemberClusterGVR).Get(memberClusterName)
4653
if err != nil {
4754
klog.ErrorS(err, "failed to get the member cluster", "memberCluster", memberClusterName)
4855
if !apierrors.IsNotFound(err) {
4956
return ctrl.Result{}, err
5057
}
58+
mObj = nil //guard against unexpected informer lib behavior
5159
}
5260
crpList, err := r.InformerManager.Lister(utils.ClusterResourcePlacementGVR).List(labels.Everything())
5361
if err != nil {

pkg/controllers/resourcechange/resourcechange_controller.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package resourcechange
88
import (
99
"context"
1010
"fmt"
11+
"time"
1112

1213
"github.com/pkg/errors"
1314
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -50,6 +51,7 @@ type Reconciler struct {
5051
}
5152

5253
func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ctrl.Result, error) {
54+
startTime := time.Now()
5355
clusterWideKey, ok := key.(keys.ClusterWideKey)
5456
if !ok {
5557
err := fmt.Errorf("got a resource key %+v not of type cluster wide key", key)
@@ -58,6 +60,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ct
5860
}
5961
klog.V(2).InfoS("Reconciling object", "obj", clusterWideKey)
6062

63+
// add latency log
64+
defer func() {
65+
klog.V(2).InfoS("ResourceChange reconciliation loop ends", "obj", clusterWideKey, "latency", time.Since(startTime).Milliseconds())
66+
}()
67+
6168
// the clusterObj is set to be the object that the placement direct selects,
6269
// in the case of a deleted namespace scoped object, the clusterObj is set to be its parent namespace object.
6370
clusterObj, isClusterScoped, err := r.getUnstructuredObject(clusterWideKey)

pkg/controllers/work/apply_controller.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,10 @@ import (
4242
"sigs.k8s.io/controller-runtime/pkg/controller"
4343
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4444
"sigs.k8s.io/controller-runtime/pkg/predicate"
45-
4645
workv1alpha1 "sigs.k8s.io/work-api/pkg/apis/v1alpha1"
46+
47+
"go.goms.io/fleet/pkg/metrics"
48+
"go.goms.io/fleet/pkg/utils"
4749
)
4850

4951
const (
@@ -139,9 +141,24 @@ func (r *ApplyWorkReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
139141
BlockOwnerDeletion: pointer.Bool(false),
140142
}
141143

142-
// Apply the manifests to the member cluster
144+
// apply the manifests to the member cluster
143145
results := r.applyManifests(ctx, work.Spec.Workload.Manifests, owner)
144146

147+
// collect the latency from the work update time to now.
148+
lastUpdateTime, ok := work.GetAnnotations()[utils.LastWorkUpdateTimeAnnotationKey]
149+
if ok {
150+
workUpdateTime, parseErr := time.Parse(time.RFC3339, lastUpdateTime)
151+
if parseErr != nil {
152+
klog.ErrorS(parseErr, "failed to parse the last work update time", "work", logObjRef)
153+
} else {
154+
latency := time.Since(workUpdateTime)
155+
metrics.WorkApplyTime.WithLabelValues(work.GetName()).Observe(latency.Seconds())
156+
klog.V(2).InfoS("work is applied", "work", work.GetName(), "latency", latency.Milliseconds())
157+
}
158+
} else {
159+
klog.V(2).InfoS("work has no last update time", "work", work.GetName())
160+
}
161+
145162
// generate the work condition based on the manifest apply result
146163
errs := r.generateWorkCondition(results, work)
147164

pkg/metrics/metrics.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,20 @@ var (
1818
Name: "leave_result_counter",
1919
Help: "Number of successful Leave operations",
2020
}, []string{"result"})
21+
WorkApplyTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
22+
Name: "work_apply_time_seconds",
23+
Help: "Length of time between when a work resource is created/updated to when it is applied on the member cluster",
24+
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0,
25+
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 7, 9, 10, 15, 20, 30, 60, 120},
26+
}, []string{"name"})
27+
PlacementApplyFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{
28+
Name: "placement_apply_failed_counter",
29+
Help: "Number of failed to apply cluster resource placement",
30+
}, []string{"name"})
31+
PlacementApplySucceedCount = prometheus.NewCounterVec(prometheus.CounterOpts{
32+
Name: "placement_apply_succeed_counter",
33+
Help: "Number of successfully applied cluster resource placement",
34+
}, []string{"name"})
2135
)
2236

2337
var (

0 commit comments

Comments (0)