Skip to content

Commit 025f974

Browse files
authored
Fix the unified image setup during bootstrap when a custom environment variable is used for the zone (#2156)
1 parent e426237 commit 025f974

File tree

8 files changed

+164
-21
lines changed

8 files changed

+164
-21
lines changed

controllers/generate_initial_cluster_file.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func (g generateInitialClusterFile) reconcile(ctx context.Context, r *Foundation
4747
}
4848

4949
logger.Info("Generating initial cluster file")
50-
r.Recorder.Event(cluster, corev1.EventTypeNormal, "ChangingCoordinators", "Choosing initial coordinators")
50+
r.Recorder.Event(cluster, corev1.EventTypeNormal, "GenerateInitialCoordinators", "Choosing initial coordinators")
5151

5252
processCounts, err := cluster.GetProcessCountsWithDefaults()
5353
if err != nil {
@@ -130,8 +130,7 @@ func (g generateInitialClusterFile) reconcile(ctx context.Context, r *Foundation
130130
}
131131

132132
coordinators, err := locality.ChooseDistributedProcesses(cluster, processLocality, count, locality.ProcessSelectionConstraint{
133-
HardLimits: locality.GetHardLimits(cluster),
134-
SelectingCoordinators: true,
133+
HardLimits: locality.GetHardLimits(cluster),
135134
})
136135
if err != nil {
137136
return &requeue{curError: err}

e2e/fixtures/cluster_config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ type ClusterConfig struct {
7878
UseDNS bool
7979
// If enabled, the cluster will be set up with the unified image.
8080
UseUnifiedImage *bool
81+
// SimulateCustomFaultDomainEnv will simulate the use case that a user has set a custom environment variable to
82+
// be used as zone ID.
83+
SimulateCustomFaultDomainEnv bool
8184
// CreationTracker, if specified, will be used to log the time between the creation steps.
8285
CreationTracker CreationTrackerLogger
8386
// Number of machines, this is used for calculating the number of Pods and is not correlated to the actual number

e2e/fixtures/fdb_cluster_specs.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,14 @@ func (factory *Factory) createFDBClusterSpec(
4848
imageType = fdbv1beta2.ImageTypeUnified
4949
}
5050

51+
faultDomain := fdbv1beta2.FoundationDBClusterFaultDomain{
52+
Key: "foundationdb.org/none",
53+
}
54+
if config.SimulateCustomFaultDomainEnv {
55+
faultDomain.ValueFrom = "$" + fdbv1beta2.EnvNameInstanceID
56+
faultDomain.Key = corev1.LabelHostname
57+
}
58+
5159
return &fdbv1beta2.FoundationDBCluster{
5260
ObjectMeta: metav1.ObjectMeta{
5361
Name: config.Name,
@@ -64,9 +72,7 @@ func (factory *Factory) createFDBClusterSpec(
6472
MainContainer: factory.GetMainContainerOverrides(config.DebugSymbols, useUnifiedImage),
6573
ImageType: &imageType,
6674
SidecarContainer: factory.GetSidecarContainerOverrides(config.DebugSymbols),
67-
FaultDomain: fdbv1beta2.FoundationDBClusterFaultDomain{
68-
Key: "foundationdb.org/none",
69-
},
75+
FaultDomain: faultDomain,
7076
AutomationOptions: fdbv1beta2.FoundationDBClusterAutomationOptions{
7177
// We have to wait long enough to ensure the operator is not recreating too many Pods at the same time.
7278
WaitBetweenRemovalsSeconds: pointer.Int(0),

e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ import (
3232
"log"
3333
"time"
3434

35+
corev1 "k8s.io/api/core/v1"
36+
3537
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
3638
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
3739
. "github.com/onsi/ginkgo/v2"
@@ -119,7 +121,7 @@ func performUpgrade(config testConfig, preUpgradeFunction func(cluster *fixtures
119121
for _, processGroup := range cluster.Status.ProcessGroups {
120122
missingTime := processGroup.GetConditionTime(fdbv1beta2.MissingProcesses)
121123
// If the Pod is missing check if the fdbserver processes are running and check the logs of the fdb-kubernetes-monitor.
122-
if missingTime != nil && time.Since(time.Unix(*missingTime, 0)) > 60*time.Second {
124+
if missingTime != nil && time.Since(time.Unix(*missingTime, 0)) > 120*time.Second && !processGroup.IsMarkedForRemoval() && !processGroup.IsExcluded() {
123125
log.Println("Missing process for:", processGroup.ProcessGroupID)
124126
stdout, stderr, err := factory.ExecuteCmd(context.Background(), cluster.Namespace, processGroup.GetPodName(cluster), fdbv1beta2.MainContainerName, "ps aufx", true)
125127
log.Println("stdout:", stdout, "stderr", stderr, "err", err)
@@ -336,4 +338,26 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() {
336338
EntryDescription("Upgrade from %[1]s to %[2]s"),
337339
fixtures.GenerateUpgradeTableEntries(testOptions),
338340
)
341+
342+
DescribeTable(
343+
"upgrading a cluster with zone ID from a custom environment variable",
344+
func(beforeVersion string, targetVersion string) {
345+
performUpgrade(testConfig{
346+
beforeVersion: beforeVersion,
347+
targetVersion: targetVersion,
348+
clusterConfig: &fixtures.ClusterConfig{
349+
DebugSymbols: false,
350+
SimulateCustomFaultDomainEnv: true,
351+
},
352+
loadData: false,
353+
}, func(cluster *fixtures.FdbCluster) {
354+
spec := cluster.GetCluster().Spec.DeepCopy()
355+
356+
Expect(spec.FaultDomain.Key).To(Equal(corev1.LabelHostname))
357+
Expect(spec.FaultDomain.ValueFrom).To(HaveSuffix(fdbv1beta2.EnvNameInstanceID))
358+
})
359+
},
360+
EntryDescription("Upgrade from %[1]s to %[2]s"),
361+
fixtures.GenerateUpgradeTableEntries(testOptions),
362+
)
339363
})

internal/coordinator/coordinator.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,7 @@ func selectCoordinatorsLocalities(logger logr.Logger, cluster *fdbv1beta2.Founda
115115
}
116116

117117
coordinators, err := locality.ChooseDistributedProcesses(cluster, candidates, coordinatorCount, locality.ProcessSelectionConstraint{
118-
HardLimits: locality.GetHardLimits(cluster),
119-
SelectingCoordinators: true,
118+
HardLimits: locality.GetHardLimits(cluster),
120119
})
121120

122121
logger.Info("Current coordinators", "coordinators", coordinators, "error", err)

internal/locality/locality.go

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/go-logr/logr"
3030
"math"
3131
"slices"
32+
"strings"
3233
)
3334

3435
// Info captures information about a process for the purposes of
@@ -114,6 +115,7 @@ func InfoForProcess(process fdbv1beta2.FoundationDBStatusProcessInfo, mainContai
114115

115116
// InfoFromSidecar converts the process information from the sidecar's
116117
// context into locality info for selecting processes.
118+
// This method is only used during the initial bootstrapping of the cluster when no fdbserver processes are running.
117119
func InfoFromSidecar(cluster *fdbv1beta2.FoundationDBCluster, client podclient.FdbPodClient) (Info, error) {
118120
substitutions, err := client.GetVariableSubstitutions()
119121
if err != nil {
@@ -124,14 +126,33 @@ func InfoFromSidecar(cluster *fdbv1beta2.FoundationDBCluster, client podclient.F
124126
return Info{}, nil
125127
}
126128

129+
// Take the zone ID from the FDB_ZONE_ID if present.
130+
zoneID, present := substitutions[fdbv1beta2.EnvNameZoneID]
131+
if !present {
132+
// If the FDB_ZONE_ID is not present, the user specified another environment variable that represents the
133+
// zone ID.
134+
var zoneVariable string
135+
if strings.HasPrefix(cluster.Spec.FaultDomain.ValueFrom, "$") {
136+
zoneVariable = cluster.Spec.FaultDomain.ValueFrom[1:]
137+
} else {
138+
zoneVariable = fdbv1beta2.EnvNameZoneID
139+
}
140+
141+
zoneID = substitutions[zoneVariable]
142+
}
143+
144+
if zoneID == "" {
145+
return Info{}, errors.New("no zone ID found in Sidecar information")
146+
}
147+
127148
// This locality information is only used during the initial cluster file generation.
128149
// So it should be good to only use the first process address here.
129150
// This has the implication that in the initial cluster file only the first processes will be used.
130151
return Info{
131152
ID: substitutions[fdbv1beta2.EnvNameInstanceID],
132153
Address: cluster.GetFullAddress(substitutions[fdbv1beta2.EnvNamePublicIP], 1),
133154
LocalityData: map[string]string{
134-
fdbv1beta2.FDBLocalityZoneIDKey: substitutions[fdbv1beta2.EnvNameZoneID],
155+
fdbv1beta2.FDBLocalityZoneIDKey: zoneID,
135156
fdbv1beta2.FDBLocalityDNSNameKey: substitutions[fdbv1beta2.EnvNameDNSName],
136157
},
137158
}, nil
@@ -164,9 +185,6 @@ type ProcessSelectionConstraint struct {
164185
// HardLimits defines a maximum number of processes to recruit on any single
165186
// value for a given locality field.
166187
HardLimits map[string]int
167-
168-
// SelectingCoordinators must be true when the ChooseDistributedProcesses is used to select coordinators.
169-
SelectingCoordinators bool
170188
}
171189

172190
// ChooseDistributedProcesses recruits a maximally well-distributed set of processes from a set of potential candidates.

internal/locality/locality_test.go

Lines changed: 101 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -603,8 +603,7 @@ var _ = Describe("Localities", func() {
603603
BeforeEach(func() {
604604
candidates = generateCandidates(dcIDs, 5, 5)
605605
result, err = ChooseDistributedProcesses(cluster, candidates, cluster.DesiredCoordinatorCount(), ProcessSelectionConstraint{
606-
HardLimits: GetHardLimits(cluster),
607-
SelectingCoordinators: true,
606+
HardLimits: GetHardLimits(cluster),
608607
})
609608
Expect(err).NotTo(HaveOccurred())
610609
})
@@ -645,8 +644,7 @@ var _ = Describe("Localities", func() {
645644
// Only measure the actual execution of ChooseDistributedProcesses.
646645
experiment.MeasureDuration("ChooseDistributedProcesses", func() {
647646
_, _ = ChooseDistributedProcesses(cluster, candidates, cluster.DesiredCoordinatorCount(), ProcessSelectionConstraint{
648-
HardLimits: GetHardLimits(cluster),
649-
SelectingCoordinators: true,
647+
HardLimits: GetHardLimits(cluster),
650648
})
651649
})
652650
// We'll sample the function up to 50 times or up to a minute, whichever comes first.
@@ -754,8 +752,7 @@ var _ = Describe("Localities", func() {
754752
}
755753

756754
result, err = ChooseDistributedProcesses(cluster, candidates, cluster.DesiredCoordinatorCount(), ProcessSelectionConstraint{
757-
HardLimits: GetHardLimits(cluster),
758-
SelectingCoordinators: true,
755+
HardLimits: GetHardLimits(cluster),
759756
})
760757
Expect(err).NotTo(HaveOccurred())
761758
})
@@ -1055,6 +1052,104 @@ var _ = Describe("Localities", func() {
10551052
Info{},
10561053
true,
10571054
),
1055+
Entry("locality information is read from a different environment variables",
1056+
&fdbv1beta2.FoundationDBCluster{
1057+
Spec: fdbv1beta2.FoundationDBClusterSpec{
1058+
FaultDomain: fdbv1beta2.FoundationDBClusterFaultDomain{
1059+
Key: corev1.LabelHostname,
1060+
ValueFrom: "$CUSTOM_ENV",
1061+
},
1062+
},
1063+
Status: fdbv1beta2.FoundationDBClusterStatus{
1064+
RequiredAddresses: fdbv1beta2.RequiredAddressSet{
1065+
NonTLS: true,
1066+
},
1067+
},
1068+
},
1069+
&corev1.Pod{
1070+
ObjectMeta: metav1.ObjectMeta{
1071+
Name: "test",
1072+
Labels: map[string]string{
1073+
fdbv1beta2.FDBProcessGroupIDLabel: "test",
1074+
},
1075+
},
1076+
Spec: corev1.PodSpec{
1077+
Containers: []corev1.Container{
1078+
{
1079+
Name: "foundationdb-kubernetes-sidecar",
1080+
Args: []string{
1081+
"--public-ip-family",
1082+
"4",
1083+
},
1084+
Env: []corev1.EnvVar{
1085+
{
1086+
Name: "CUSTOM_ENV",
1087+
Value: "custom-zone-id",
1088+
},
1089+
},
1090+
},
1091+
},
1092+
},
1093+
Status: corev1.PodStatus{
1094+
PodIPs: []corev1.PodIP{
1095+
{IP: "1.1.1.1"},
1096+
},
1097+
},
1098+
},
1099+
Info{
1100+
ID: "test",
1101+
Address: fdbv1beta2.ProcessAddress{
1102+
IPAddress: net.ParseIP("1.1.1.1"),
1103+
Port: 4501,
1104+
},
1105+
LocalityData: map[string]string{
1106+
fdbv1beta2.FDBLocalityZoneIDKey: "custom-zone-id",
1107+
fdbv1beta2.FDBLocalityDNSNameKey: "",
1108+
},
1109+
},
1110+
false,
1111+
),
1112+
Entry("locality information is read from a different environment variable which is missing",
1113+
&fdbv1beta2.FoundationDBCluster{
1114+
Spec: fdbv1beta2.FoundationDBClusterSpec{
1115+
FaultDomain: fdbv1beta2.FoundationDBClusterFaultDomain{
1116+
Key: corev1.LabelHostname,
1117+
ValueFrom: "$CUSTOM_ENV",
1118+
},
1119+
},
1120+
Status: fdbv1beta2.FoundationDBClusterStatus{
1121+
RequiredAddresses: fdbv1beta2.RequiredAddressSet{
1122+
NonTLS: true,
1123+
},
1124+
},
1125+
},
1126+
&corev1.Pod{
1127+
ObjectMeta: metav1.ObjectMeta{
1128+
Name: "test",
1129+
Labels: map[string]string{
1130+
fdbv1beta2.FDBProcessGroupIDLabel: "test",
1131+
},
1132+
},
1133+
Spec: corev1.PodSpec{
1134+
Containers: []corev1.Container{
1135+
{
1136+
Name: "foundationdb-kubernetes-sidecar",
1137+
Args: []string{
1138+
"--public-ip-family",
1139+
"4",
1140+
},
1141+
},
1142+
},
1143+
},
1144+
Status: corev1.PodStatus{
1145+
PodIPs: []corev1.PodIP{
1146+
{IP: "1.1.1.1"},
1147+
},
1148+
},
1149+
},
1150+
Info{},
1151+
true,
1152+
),
10581153
)
10591154

10601155
Describe("checkCoordinatorValidity", func() {

internal/pod_client.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -434,8 +434,6 @@ func GetSubstitutionsFromClusterAndPod(logger logr.Logger, cluster *fdbv1beta2.F
434434

435435
if faultDomainSource == "spec.nodeName" {
436436
substitutions[fdbv1beta2.EnvNameZoneID] = pod.Spec.NodeName
437-
} else {
438-
return nil, fmt.Errorf("unsupported fault domain source %s", faultDomainSource)
439437
}
440438
}
441439

@@ -450,6 +448,7 @@ func GetSubstitutionsFromClusterAndPod(logger logr.Logger, cluster *fdbv1beta2.F
450448
copyableSubstitutions := map[string]fdbv1beta2.None{
451449
fdbv1beta2.EnvNameDNSName: {},
452450
fdbv1beta2.EnvNameInstanceID: {},
451+
"CUSTOM_ENV": {},
453452
}
454453
for _, container := range pod.Spec.Containers {
455454
for _, envVar := range container.Env {

0 commit comments

Comments
 (0)