Skip to content

Commit 7635f80

Browse files
authored
Correct the reconciliation status after the initial run after the cluster wide bounce (#2341)
* Correct the reconciliation status after the initial run after the cluster wide bounce
1 parent 09e19a1 commit 7635f80

File tree

5 files changed

+138
-37
lines changed

5 files changed

+138
-37
lines changed

api/v1beta2/foundationdbcluster_types.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1805,6 +1805,7 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
18051805
"reconciledProcessGroups",
18061806
cluster.Status.ReconciledProcessGroups,
18071807
)
1808+
reconciled = false
18081809
}
18091810

18101811
if !cluster.Status.Health.Available {
@@ -1903,6 +1904,19 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
19031904
}
19041905
}
19051906

1907+
// If the cluster is currently in the process to be upgraded, don't set the reconciled version.
1908+
if cluster.IsBeingUpgraded() {
1909+
logger.Info(
1910+
"Pending cluster upgrade",
1911+
"runningVersion",
1912+
cluster.GetRunningVersion(),
1913+
"desiredVersion",
1914+
cluster.Spec.Version,
1915+
)
1916+
1917+
reconciled = false
1918+
}
1919+
19061920
if reconciled && cluster.Status.Generations.Reconciled != cluster.Generation {
19071921
logger.Info(
19081922
"Update reconciled generation",

controllers/update_sidecar_versions.go

Lines changed: 55 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,11 @@ package controllers
2222

2323
import (
2424
"context"
25+
"errors"
2526
"fmt"
2627

2728
"github.com/go-logr/logr"
2829

29-
"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/podmanager"
30-
3130
"github.com/FoundationDB/fdb-kubernetes-operator/v2/internal"
3231

3332
corev1 "k8s.io/api/core/v1"
@@ -53,6 +52,7 @@ func (updateSidecarVersions) reconcile(
5352
}
5453

5554
var upgraded int
55+
var errs []error
5656
for _, processGroup := range cluster.Status.ProcessGroups {
5757
if processGroup.GetConditionTime(fdbv1beta2.ResourcesTerminating) != nil {
5858
logger.V(1).Info("Ignore process group that is stuck terminating",
@@ -74,37 +74,13 @@ func (updateSidecarVersions) reconcile(
7474
continue
7575
}
7676

77-
image, err := internal.GetSidecarImage(cluster, processGroup.ProcessClass)
77+
err = updateSidecarImage(ctx, r, logger, cluster, processGroup, pod)
7878
if err != nil {
79-
return &requeue{curError: err}
79+
errs = append(errs, err)
80+
continue
8081
}
8182

82-
for containerIndex, container := range pod.Spec.Containers {
83-
if container.Name == fdbv1beta2.SidecarContainerName && container.Image != image {
84-
logger.Info(
85-
"Upgrading sidecar",
86-
"processGroupID",
87-
podmanager.GetProcessGroupID(cluster, pod),
88-
"oldImage",
89-
container.Image,
90-
"newImage",
91-
image,
92-
)
93-
err = r.PodLifecycleManager.UpdateImageVersion(
94-
logr.NewContext(ctx, logger),
95-
r,
96-
cluster,
97-
pod,
98-
containerIndex,
99-
image,
100-
)
101-
if err != nil {
102-
return &requeue{curError: err}
103-
}
104-
upgraded++
105-
break
106-
}
107-
}
83+
upgraded++
10884
}
10985

11086
if upgraded > 0 {
@@ -120,5 +96,54 @@ func (updateSidecarVersions) reconcile(
12096
)
12197
}
12298

99+
if len(errs) > 0 {
100+
return &requeue{
101+
curError: errors.Join(errs...),
102+
}
103+
}
104+
105+
return nil
106+
}
107+
108+
func updateSidecarImage(
109+
ctx context.Context,
110+
r *FoundationDBClusterReconciler,
111+
logger logr.Logger,
112+
cluster *fdbv1beta2.FoundationDBCluster,
113+
processGroup *fdbv1beta2.ProcessGroupStatus,
114+
pod *corev1.Pod,
115+
) error {
116+
image, err := internal.GetSidecarImage(cluster, processGroup.ProcessClass)
117+
if err != nil {
118+
return err
119+
}
120+
121+
for containerIndex, container := range pod.Spec.Containers {
122+
if container.Name == fdbv1beta2.SidecarContainerName && container.Image != image {
123+
logger.Info(
124+
"Upgrading sidecar",
125+
"processGroupID",
126+
processGroup.ProcessGroupID,
127+
"oldImage",
128+
container.Image,
129+
"newImage",
130+
image,
131+
)
132+
err = r.PodLifecycleManager.UpdateImageVersion(
133+
logr.NewContext(ctx, logger),
134+
r,
135+
cluster,
136+
pod,
137+
containerIndex,
138+
image,
139+
)
140+
if err != nil {
141+
return err
142+
}
143+
144+
break
145+
}
146+
}
147+
123148
return nil
124149
}

controllers/update_status.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func (c updateStatus) reconcile(
9898
}
9999

100100
// Update the running version based on the reported version of the FDB processes
101-
version, err := getRunningVersion(logger, versionMap, cluster.Status.RunningVersion)
101+
version, err := getRunningVersion(logger, versionMap, cluster.GetRunningVersion())
102102
if err != nil {
103103
return &requeue{
104104
curError: fmt.Errorf(
@@ -108,6 +108,14 @@ func (c updateStatus) reconcile(
108108
}
109109
}
110110
clusterStatus.RunningVersion = version
111+
// We update the running version of the cluster status here to make sure that the Pod Spec hash and other things
112+
// are properly calculated based on the current observed running version. If we don't update the running version
113+
// here, we could see a race condition where the updateStatus sub-reconciler runs for the first time after an upgrade
114+
// bounce. In this case the running version of the new cluster status (clusterStatus) would be updated but the
115+
// status on the cluster struct would be the same. The result of this would be that the operator things the cluster
116+
// is reconciled and in the next updateStatus run, it would observe all the required changes after the cluster
117+
// wide bounce.
118+
cluster.Status.RunningVersion = version
111119

112120
clusterStatus.HasListenIPsForAllPods = cluster.NeedsExplicitListenAddress()
113121
// Update the configuration if the database is available, otherwise the machine-readable status will contain no information

controllers/update_status_test.go

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,7 +1266,6 @@ var _ = Describe("update_status", func() {
12661266
}
12671267
})
12681268
})
1269-
12701269
})
12711270
})
12721271

@@ -1277,8 +1276,7 @@ var _ = Describe("update_status", func() {
12771276

12781277
BeforeEach(func() {
12791278
cluster = internal.CreateDefaultCluster()
1280-
err = k8sClient.Create(context.TODO(), cluster)
1281-
Expect(err).NotTo(HaveOccurred())
1279+
Expect(k8sClient.Create(context.TODO(), cluster)).To(Succeed())
12821280

12831281
result, err := reconcileCluster(cluster)
12841282
Expect(err).NotTo(HaveOccurred())
@@ -1290,8 +1288,9 @@ var _ = Describe("update_status", func() {
12901288
})
12911289

12921290
JustBeforeEach(func() {
1293-
err = internal.NormalizeClusterSpec(cluster, internal.DeprecationOptions{})
1294-
Expect(err).NotTo(HaveOccurred())
1291+
Expect(
1292+
internal.NormalizeClusterSpec(cluster, internal.DeprecationOptions{}),
1293+
).To(Succeed())
12951294
requeue = updateStatus{}.reconcile(
12961295
context.TODO(),
12971296
clusterReconciler,
@@ -1351,6 +1350,61 @@ var _ = Describe("update_status", func() {
13511350
}
13521351
})
13531352
})
1353+
1354+
When("the cluster was bounced for an upgrade", func() {
1355+
BeforeEach(func() {
1356+
version, err := fdbv1beta2.ParseFdbVersion(cluster.Spec.Version)
1357+
Expect(err).NotTo(HaveOccurred())
1358+
cluster.Spec.Version = version.NextMajorVersion().String()
1359+
Expect(k8sClient.Update(context.Background(), cluster)).To(Succeed())
1360+
1361+
adminClient, err := mock.NewMockAdminClientUncast(cluster, k8sClient)
1362+
Expect(err).NotTo(HaveOccurred())
1363+
1364+
Expect(
1365+
internal.NormalizeClusterSpec(cluster, internal.DeprecationOptions{}),
1366+
).To(Succeed())
1367+
// Prepare the sidecar versions and make sure that we bounce the cluster.
1368+
var addresses []fdbv1beta2.ProcessAddress
1369+
versions := make(map[fdbv1beta2.ProcessGroupID]string)
1370+
for _, processGroup := range cluster.Status.ProcessGroups {
1371+
for _, addr := range processGroup.Addresses {
1372+
parsedAddr, err := fdbv1beta2.ParseProcessAddress(addr)
1373+
Expect(err).NotTo(HaveOccurred())
1374+
addresses = append(addresses, parsedAddr)
1375+
}
1376+
versions[processGroup.ProcessGroupID] = cluster.Spec.Version
1377+
1378+
pod, err := internal.GetPod(cluster, processGroup)
1379+
Expect(err).NotTo(HaveOccurred())
1380+
Expect(
1381+
updateSidecarImage(
1382+
context.Background(),
1383+
clusterReconciler,
1384+
logger,
1385+
cluster,
1386+
processGroup,
1387+
pod,
1388+
),
1389+
).To(Succeed())
1390+
}
1391+
1392+
Expect(adminClient.KillProcesses(addresses)).NotTo(HaveOccurred())
1393+
adminClient.VersionProcessGroups = versions
1394+
1395+
_, err = reloadCluster(cluster)
1396+
Expect(err).NotTo(HaveOccurred())
1397+
})
1398+
1399+
It("should update the incorrect sidecar image condition", func() {
1400+
Expect(requeue).NotTo(BeNil())
1401+
Expect(cluster.Generation).NotTo(Equal(cluster.Status.Generations.Reconciled))
1402+
1403+
for _, processGroup := range cluster.Status.ProcessGroups {
1404+
Expect(processGroup.ProcessGroupConditions).NotTo(HaveLen(0))
1405+
}
1406+
})
1407+
})
13541408
})
13551409

13561410
DescribeTable(

e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ func performUpgrade(config testConfig, preUpgradeFunction func(cluster *fixtures
192192
// replacements during an upgrade.
193193
expectedProcessCounts := (processCounts.Total()-processCounts.Storage)*2 + 5
194194
Expect(len(transactionSystemProcessGroups)).To(BeNumerically("<=", expectedProcessCounts))
195-
// Ensure we have not data loss.
195+
// Ensure we have no data loss.
196196
fdbCluster.EnsureTeamTrackersAreHealthy()
197197
fdbCluster.EnsureTeamTrackersHaveMinReplicas()
198198
}

0 commit comments

Comments
 (0)