
Commit 1874289

fix: fix updateRun stuck with reportDiff mode (#81)

1 parent c036a6f

File tree: 10 files changed, +830 −650 lines changed


pkg/controllers/clusterresourceplacementeviction/controller_intergration_test.go

Lines changed: 13 additions & 17 deletions
```diff
@@ -22,7 +22,6 @@ import (
 
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
-    "github.com/prometheus/client_golang/prometheus"
     prometheusclientmodel "github.com/prometheus/client_model/go"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -31,6 +30,7 @@ import (
     "k8s.io/apimachinery/pkg/util/intstr"
     "k8s.io/utils/ptr"
     "sigs.k8s.io/controller-runtime/pkg/client"
+    ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 
     placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
     "github.com/kubefleet-dev/kubefleet/pkg/utils/condition"
@@ -55,12 +55,8 @@ const (
 var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
     crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess())
     evictionName := fmt.Sprintf(evictionNameTemplate, GinkgoParallelProcess())
-    var customRegistry *prometheus.Registry
 
     BeforeEach(func() {
-        // Create a test registry
-        customRegistry = prometheus.NewRegistry()
-        Expect(customRegistry.Register(metrics.FleetEvictionStatus)).Should(Succeed())
         // Reset metrics before each test
         metrics.FleetEvictionStatus.Reset()
         // emit incomplete eviction metric to simulate eviction failed once.
@@ -72,7 +68,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         ensureAllBindingsAreRemoved(crpName)
         ensureEvictionRemoved(evictionName)
         ensureCRPRemoved(crpName)
-        Expect(customRegistry.Unregister(metrics.FleetEvictionStatus)).Should(BeTrue())
+        metrics.FleetEvictionStatus.Reset()
     })
 
     It("Invalid Eviction Blocked - emit complete metric with isValid=false, isComplete=true", func() {
@@ -90,7 +86,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "false", "true")
+            checkEvictionCompleteMetric("false", "true")
         })
     })
 
@@ -169,7 +165,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "true", "true")
+            checkEvictionCompleteMetric("true", "true")
         })
     })
 
@@ -269,7 +265,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "true", "true")
+            checkEvictionCompleteMetric("true", "true")
         })
     })
 
@@ -348,7 +344,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "true", "true")
+            checkEvictionCompleteMetric("true", "true")
        })
     })
 
@@ -478,7 +474,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "true", "true")
+            checkEvictionCompleteMetric("true", "true")
         })
     })
 
@@ -527,7 +523,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "false", "true")
+            checkEvictionCompleteMetric("false", "true")
         })
     })
 
@@ -623,7 +619,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "true", "true")
+            checkEvictionCompleteMetric("true", "true")
         })
     })
 
@@ -660,7 +656,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "true", "true")
+            checkEvictionCompleteMetric("true", "true")
         })
     })
 
@@ -697,7 +693,7 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
         })
 
         By("Ensure eviction complete metric was emitted", func() {
-            checkEvictionCompleteMetric(customRegistry, "true", "true")
+            checkEvictionCompleteMetric("true", "true")
         })
     })
 })
@@ -818,8 +814,8 @@ func ensureAllBindingsAreRemoved(crpName string) {
     }
 }
 
-func checkEvictionCompleteMetric(registry *prometheus.Registry, isValid, isComplete string) {
-    metricFamilies, err := registry.Gather()
+func checkEvictionCompleteMetric(isValid, isComplete string) {
+    metricFamilies, err := ctrlmetrics.Registry.Gather()
     Expect(err).Should(Succeed())
     var evictionCompleteMetrics []*prometheusclientmodel.Metric
     for _, mf := range metricFamilies {
```
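The change above swaps the per-test Prometheus registry for the global registry that controller-runtime already populates, so the test gathers from `ctrlmetrics.Registry` directly. A minimal sketch of that gather-and-filter pattern, assuming the caller knows the metric family name (the helper below is illustrative, not part of this commit):

```go
package example

import (
	"fmt"

	prometheusclientmodel "github.com/prometheus/client_model/go"
	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// gatherFamily pulls one metric family out of the controller-runtime
// global registry; it returns nil if that family was never emitted.
func gatherFamily(name string) ([]*prometheusclientmodel.Metric, error) {
	families, err := ctrlmetrics.Registry.Gather()
	if err != nil {
		return nil, fmt.Errorf("failed to gather metrics: %w", err)
	}
	for _, mf := range families {
		if mf.GetName() == name {
			return mf.GetMetric(), nil
		}
	}
	return nil, nil // family not found: no samples emitted yet
}
```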

pkg/controllers/clusterresourceplacementeviction/controller_test.go

Lines changed: 2 additions & 8 deletions
```diff
@@ -26,7 +26,6 @@ import (
 
     "github.com/google/go-cmp/cmp"
     "github.com/google/go-cmp/cmp/cmpopts"
-    "github.com/prometheus/client_golang/prometheus"
     prometheusclientmodel "github.com/prometheus/client_model/go"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/runtime"
@@ -36,6 +35,7 @@ import (
     controllerruntime "sigs.k8s.io/controller-runtime"
     "sigs.k8s.io/controller-runtime/pkg/client"
     "sigs.k8s.io/controller-runtime/pkg/client/fake"
+    ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 
     placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
     "github.com/kubefleet-dev/kubefleet/pkg/utils/condition"
@@ -1492,12 +1492,6 @@ func TestReconcileForIncompleteEvictionMetric(t *testing.T) {
     isValid := "unknown"
     isComplete := "false"
 
-    // Create a test registry
-    customRegistry := prometheus.NewRegistry()
-    if err := customRegistry.Register(metrics.FleetEvictionStatus); err != nil {
-        t.Errorf("Failed to register metric: %v", err)
-    }
-
     // Reset metrics before each test
     metrics.FleetEvictionStatus.Reset()
 
@@ -1513,7 +1507,7 @@ func TestReconcileForIncompleteEvictionMetric(t *testing.T) {
         t.Errorf("reconcile should have failed")
     }
 
-    metricFamilies, err := customRegistry.Gather()
+    metricFamilies, err := ctrlmetrics.Registry.Gather()
     if err != nil {
         t.Errorf("error gathering metrics: %v", err)
     }
```
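The unit test follows the same pattern, and note that `Unregister` is replaced by `Reset` throughout. With a process-global registry a test must not unregister a collector that other tests and the controller itself still share; clearing the collector's label children is the safe equivalent. A hedged sketch of the distinction, using a stand-in gauge rather than the real `metrics.FleetEvictionStatus`:

```go
package example

import (
	"github.com/prometheus/client_golang/prometheus"
	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

func init() {
	// Stand-in collector, registered once for the whole process.
	demo := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "demo_eviction_status"},
		[]string{"isValid", "isCompleted"},
	)
	ctrlmetrics.Registry.MustRegister(demo)
	demo.WithLabelValues("true", "true").SetToCurrentTime()

	// Reset drops every label child but keeps the collector registered,
	// so the next test starts clean without racing an Unregister call.
	demo.Reset()
}
```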

pkg/controllers/updaterun/controller_integration_test.go

Lines changed: 87 additions & 27 deletions
```diff
@@ -18,16 +18,17 @@ package updaterun
 
 import (
     "context"
+    "encoding/json"
     "fmt"
     "strconv"
     "time"
 
     "github.com/google/go-cmp/cmp"
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
-    "github.com/prometheus/client_golang/prometheus"
     prometheusclientmodel "github.com/prometheus/client_model/go"
 
+    corev1 "k8s.io/api/core/v1"
     rbacv1 "k8s.io/api/rbac/v1"
     apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
     "k8s.io/apimachinery/pkg/api/errors"
@@ -38,6 +39,7 @@ import (
     "k8s.io/utils/ptr"
     "sigs.k8s.io/controller-runtime/pkg/client"
     "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+    ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 
     clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1"
     placementv1alpha1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1alpha1"
@@ -54,13 +56,6 @@ const (
     // duration is the time to duration to check for Consistently
     duration = time.Second * 20
 
-    // numTargetClusters is the number of scheduled clusters
-    numTargetClusters = 10
-    // numUnscheduledClusters is the number of unscheduled clusters
-    numUnscheduledClusters = 3
-    // numberOfClustersAnnotation is the number of clusters in the test latest policy snapshot
-    numberOfClustersAnnotation = numTargetClusters
-
     // testResourceSnapshotIndex is the index of the test resource snapshot
     testResourceSnapshotIndex = "0"
 )
@@ -77,8 +72,6 @@ var (
     testUpdateStrategyName  string
     testCROName             string
     updateRunNamespacedName types.NamespacedName
-    testNamespace           []byte
-    customRegistry          *prometheus.Registry
 )
 
 var _ = Describe("Test the clusterStagedUpdateRun controller", func() {
@@ -90,15 +83,13 @@ var _ = Describe("Test the clusterStagedUpdateRun controller", func() {
     testUpdateStrategyName = "updatestrategy-" + utils.RandStr()
     testCROName = "cro-" + utils.RandStr()
     updateRunNamespacedName = types.NamespacedName{Name: testUpdateRunName}
-
-    customRegistry = initializeUpdateRunMetricsRegistry()
 })
 
 AfterEach(func() {
     By("Checking the update run status metrics are removed")
     // No metrics are emitted as all are removed after updateRun is deleted.
-    validateUpdateRunMetricsEmitted(customRegistry)
-    unregisterUpdateRunMetrics(customRegistry)
+    validateUpdateRunMetricsEmitted()
+    resetUpdateRunMetrics()
 })
 
 Context("Test reconciling a clusterStagedUpdateRun", func() {
@@ -242,23 +233,14 @@ var _ = Describe("Test the clusterStagedUpdateRun controller", func() {
     })
 })
 
-func initializeUpdateRunMetricsRegistry() *prometheus.Registry {
-    // Create a test registry
-    customRegistry := prometheus.NewRegistry()
-    Expect(customRegistry.Register(metrics.FleetUpdateRunStatusLastTimestampSeconds)).Should(Succeed())
-    // Reset metrics before each test
+func resetUpdateRunMetrics() {
     metrics.FleetUpdateRunStatusLastTimestampSeconds.Reset()
-    return customRegistry
-}
-
-func unregisterUpdateRunMetrics(registry *prometheus.Registry) {
-    Expect(registry.Unregister(metrics.FleetUpdateRunStatusLastTimestampSeconds)).Should(BeTrue())
 }
 
 // validateUpdateRunMetricsEmitted validates the update run status metrics are emitted and are emitted in the correct order.
-func validateUpdateRunMetricsEmitted(registry *prometheus.Registry, wantMetrics ...*prometheusclientmodel.Metric) {
+func validateUpdateRunMetricsEmitted(wantMetrics ...*prometheusclientmodel.Metric) {
     Eventually(func() error {
-        metricFamilies, err := registry.Gather()
+        metricFamilies, err := ctrlmetrics.Registry.Gather()
         if err != nil {
             return fmt.Errorf("failed to gather metrics: %w", err)
         }
@@ -388,7 +370,7 @@ func generateTestClusterResourcePlacement() *placementv1beta1.ClusterResourcePlacement {
     }
 }
 
-func generateTestClusterSchedulingPolicySnapshot(idx int) *placementv1beta1.ClusterSchedulingPolicySnapshot {
+func generateTestClusterSchedulingPolicySnapshot(idx, numberOfClustersAnnotation int) *placementv1beta1.ClusterSchedulingPolicySnapshot {
     return &placementv1beta1.ClusterSchedulingPolicySnapshot{
         ObjectMeta: metav1.ObjectMeta{
             Name: fmt.Sprintf(placementv1beta1.PolicySnapshotNameFmt, testCRPName, idx),
@@ -410,6 +392,51 @@ func generateTestClusterSchedulingPolicySnapshot(idx, numberOfClustersAnnotation int) *placementv1beta1.ClusterSchedulingPolicySnapshot {
     }
 }
 
+func generateTestClusterResourceBindingsAndClusters(policySnapshotIndex int) ([]*placementv1beta1.ClusterResourceBinding, []*clusterv1beta1.MemberCluster, []*clusterv1beta1.MemberCluster) {
+    numTargetClusters := 10
+    numUnscheduledClusters := 3
+    policySnapshotName := fmt.Sprintf(placementv1beta1.PolicySnapshotNameFmt, testCRPName, policySnapshotIndex)
+    resourceBindings := make([]*placementv1beta1.ClusterResourceBinding, numTargetClusters+numUnscheduledClusters)
+    targetClusters := make([]*clusterv1beta1.MemberCluster, numTargetClusters)
+    for i := range targetClusters {
+        // split the clusters into 2 regions
+        region := regionEastus
+        if i%2 == 0 {
+            region = regionWestus
+        }
+        // reserse the order of the clusters by index
+        targetClusters[i] = generateTestMemberCluster(numTargetClusters-1-i, "cluster-"+strconv.Itoa(i), map[string]string{"group": "prod", "region": region})
+        resourceBindings[i] = generateTestClusterResourceBinding(policySnapshotName, targetClusters[i].Name, placementv1beta1.BindingStateScheduled)
+    }
+
+    unscheduledClusters := make([]*clusterv1beta1.MemberCluster, numUnscheduledClusters)
+    for i := range unscheduledClusters {
+        unscheduledClusters[i] = generateTestMemberCluster(i, "unscheduled-cluster-"+strconv.Itoa(i), map[string]string{"group": "staging"})
+        // update the policySnapshot name so that these clusters are considered to-be-deleted
+        resourceBindings[numTargetClusters+i] = generateTestClusterResourceBinding(policySnapshotName+"a", unscheduledClusters[i].Name, placementv1beta1.BindingStateUnscheduled)
+    }
+    return resourceBindings, targetClusters, unscheduledClusters
+}
+
+func generateSmallTestClusterResourceBindingsAndClusters(policySnapshotIndex int) ([]*placementv1beta1.ClusterResourceBinding, []*clusterv1beta1.MemberCluster, []*clusterv1beta1.MemberCluster) {
+    numTargetClusters := 3
+    policySnapshotName := fmt.Sprintf(placementv1beta1.PolicySnapshotNameFmt, testCRPName, policySnapshotIndex)
+    resourceBindings := make([]*placementv1beta1.ClusterResourceBinding, numTargetClusters)
+    targetClusters := make([]*clusterv1beta1.MemberCluster, numTargetClusters)
+    for i := range targetClusters {
+        // split the clusters into 2 regions
+        region := regionEastus
+        if i%2 == 0 {
+            region = regionWestus
+        }
+        // reserse the order of the clusters by index
+        targetClusters[i] = generateTestMemberCluster(numTargetClusters-1-i, "cluster-"+strconv.Itoa(i), map[string]string{"group": "prod", "region": region})
+        resourceBindings[i] = generateTestClusterResourceBinding(policySnapshotName, targetClusters[i].Name, placementv1beta1.BindingStateScheduled)
+    }
+    unscheduledClusters := make([]*clusterv1beta1.MemberCluster, 0)
+    return resourceBindings, targetClusters, unscheduledClusters
+}
+
 func generateTestClusterResourceBinding(policySnapshotName, targetCluster string, state placementv1beta1.BindingState) *placementv1beta1.ClusterResourceBinding {
     binding := &placementv1beta1.ClusterResourceBinding{
         ObjectMeta: metav1.ObjectMeta{
@@ -504,7 +531,36 @@ func generateTestClusterStagedUpdateStrategy() *placementv1beta1.ClusterStagedUpdateStrategy {
     }
 }
 
+func generateTestClusterStagedUpdateStrategyWithSingleStage(afterStageTasks []placementv1beta1.AfterStageTask) *placementv1beta1.ClusterStagedUpdateStrategy {
+    return &placementv1beta1.ClusterStagedUpdateStrategy{
+        ObjectMeta: metav1.ObjectMeta{
+            Name: testUpdateStrategyName,
+        },
+        Spec: placementv1beta1.StagedUpdateStrategySpec{
+            Stages: []placementv1beta1.StageConfig{
+                {
+                    Name:            "stage1",
+                    LabelSelector:   &metav1.LabelSelector{}, // Select all clusters.
+                    AfterStageTasks: afterStageTasks,
+                },
+            },
+        },
+    }
+}
+
 func generateTestClusterResourceSnapshot() *placementv1beta1.ClusterResourceSnapshot {
+    testNamespace, _ := json.Marshal(corev1.Namespace{
+        TypeMeta: metav1.TypeMeta{
+            APIVersion: "v1",
+            Kind:       "Namespace",
+        },
+        ObjectMeta: metav1.ObjectMeta{
+            Name: "test-namespace",
+            Labels: map[string]string{
+                "fleet.azure.com/name": "test-namespace",
+            },
+        },
+    })
     clusterResourceSnapshot := &placementv1beta1.ClusterResourceSnapshot{
         ObjectMeta: metav1.ObjectMeta{
             Name: testResourceSnapshotName,
@@ -671,6 +727,8 @@ func generateTrueCondition(obj client.Object, condType any) metav1.Condition {
         switch cond {
         case placementv1beta1.ResourceBindingAvailable:
             reason = condition.AvailableReason
+        case placementv1beta1.ResourceBindingDiffReported:
+            reason = condition.DiffReportedStatusTrueReason
         }
         typeStr = string(cond)
     }
@@ -713,6 +771,8 @@ func generateFalseCondition(obj client.Object, condType any) metav1.Condition {
         switch cond {
         case placementv1beta1.ResourceBindingApplied:
             reason = condition.ApplyFailedReason
+        case placementv1beta1.ResourceBindingDiffReported:
+            reason = condition.DiffReportedStatusFalseReason
        }
         typeStr = string(cond)
     }
```
pkg/controllers/updaterun/execution.go

Lines changed: 5 additions & 3 deletions
```diff
@@ -460,13 +460,15 @@ func checkClusterUpdateResult(
     updateRun *placementv1beta1.ClusterStagedUpdateRun,
 ) (bool, error) {
     availCond := binding.GetCondition(string(placementv1beta1.ResourceBindingAvailable))
-    if condition.IsConditionStatusTrue(availCond, binding.Generation) {
-        // The resource updated on the cluster is available.
+    diffReportCondition := binding.GetCondition(string(placementv1beta1.ResourceBindingDiffReported))
+    if condition.IsConditionStatusTrue(availCond, binding.Generation) ||
+        condition.IsConditionStatusTrue(diffReportCondition, binding.Generation) {
+        // The resource updated on the cluster is available or diff is successfully reported.
         klog.InfoS("The cluster has been updated", "cluster", clusterStatus.ClusterName, "stage", updatingStage.StageName, "clusterStagedUpdateRun", klog.KObj(updateRun))
         markClusterUpdatingSucceeded(clusterStatus, updateRun.Generation)
         return true, nil
     }
-    if bindingutils.HasBindingFailed(binding) {
+    if bindingutils.HasBindingFailed(binding) || condition.IsConditionStatusFalse(diffReportCondition, binding.Generation) {
         // We have no way to know if the failed condition is recoverable or not so we just let it run
         klog.InfoS("The cluster updating encountered an error", "cluster", clusterStatus.ClusterName, "stage", updatingStage.StageName, "clusterStagedUpdateRun", klog.KObj(updateRun))
         // TODO(wantjian): identify some non-recoverable error and mark the cluster updating as failed
```