
Commit c54236a

improve etcd management in CAPIM
1 parent 6073e52 commit c54236a

File tree

5 files changed: +242 −30 lines changed
Lines changed: 37 additions & 0 deletions (new file)
@@ -0,0 +1,37 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+// Defines the annotations applied to in-memory etcd pods in order to track the etcd cluster
+// info belonging to the etcd member each pod represents.
+const (
+	// EtcdClusterIDAnnotationName defines the name of the annotation applied to in-memory etcd
+	// pods to track the cluster ID of the etcd member each pod represents.
+	EtcdClusterIDAnnotationName = "etcd.inmemory.infrastructure.cluster.x-k8s.io/cluster-id"
+
+	// EtcdMemberIDAnnotationName defines the name of the annotation applied to in-memory etcd
+	// pods to track the member ID of the etcd member each pod represents.
+	EtcdMemberIDAnnotationName = "etcd.inmemory.infrastructure.cluster.x-k8s.io/member-id"
+
+	// EtcdLeaderFromAnnotationName defines the name of the annotation applied to in-memory etcd
+	// pods to track the leadership status of the etcd member each pod represents.
+	// Note: We track the time from which an etcd member has been leader; if more than one pod has
+	// this annotation, the etcd member that most recently became leader is the current leader.
+	// With this mechanism leadership can be forwarded to another pod with an atomic operation
+	// (add/update of the annotation on the pod/etcd member leadership is being forwarded to).
+	EtcdLeaderFromAnnotationName = "etcd.inmemory.infrastructure.cluster.x-k8s.io/leader-from"
+)
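
The leader-from convention above means leadership can be read and moved purely through pod annotations. Below is a minimal, illustrative Go sketch of the two operations; the helper names (forwardLeadership, currentLeader) are hypothetical and not part of this commit, the annotation keys are copied from the constants above, and in the provider the equivalent logic runs against the in-memory cloud client rather than a plain controller-runtime client.

package example

import (
	"context"
	"time"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Annotation keys copied from the constants defined above.
const (
	etcdMemberIDAnnotation   = "etcd.inmemory.infrastructure.cluster.x-k8s.io/member-id"
	etcdLeaderFromAnnotation = "etcd.inmemory.infrastructure.cluster.x-k8s.io/leader-from"
)

// forwardLeadership marks targetPod as the new etcd leader by stamping a fresh leader-from
// annotation on it; readers that pick the most recent leader-from value will then consider
// this member the current leader, without touching the previous leader's pod.
func forwardLeadership(ctx context.Context, c client.Client, targetPod *corev1.Pod) error {
	if targetPod.Annotations == nil {
		targetPod.Annotations = map[string]string{}
	}
	targetPod.Annotations[etcdLeaderFromAnnotation] = time.Now().Format(time.RFC3339)
	return c.Update(ctx, targetPod)
}

// currentLeader returns the member ID of the pod whose leader-from annotation is the most recent.
func currentLeader(pods []corev1.Pod) string {
	var leaderID string
	var leaderFrom time.Time
	for i := range pods {
		annotations := pods[i].Annotations
		if t, err := time.Parse(time.RFC3339, annotations[etcdLeaderFromAnnotation]); err == nil && t.After(leaderFrom) {
			leaderID = annotations[etcdMemberIDAnnotation]
			leaderFrom = t
		}
	}
	return leaderID
}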

test/infrastructure/inmemory/internal/controllers/inmemorymachine_controller.go

Lines changed: 90 additions & 7 deletions
@@ -31,6 +31,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	kerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/klog/v2"
 	"k8s.io/utils/pointer"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -44,6 +45,7 @@ import (
 	infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/inmemory/api/v1alpha1"
 	"sigs.k8s.io/cluster-api/test/infrastructure/inmemory/internal/cloud"
 	cloudv1 "sigs.k8s.io/cluster-api/test/infrastructure/inmemory/internal/cloud/api/v1alpha1"
+	cclient "sigs.k8s.io/cluster-api/test/infrastructure/inmemory/internal/cloud/runtime/client"
 	"sigs.k8s.io/cluster-api/test/infrastructure/inmemory/internal/server"
 	"sigs.k8s.io/cluster-api/util"
 	"sigs.k8s.io/cluster-api/util/annotations"
@@ -421,13 +423,6 @@ func (r *InMemoryMachineReconciler) reconcileNormalETCD(ctx context.Context, clu
 				"component": "etcd",
 				"tier":      "control-plane",
 			},
-			Annotations: map[string]string{
-				// TODO: read this from existing etcd pods, if any, otherwise all the member will get a different ClusterID.
-				"etcd.inmemory.infrastructure.cluster.x-k8s.io/cluster-id": fmt.Sprintf("%d", rand.Uint32()), //nolint:gosec // weak random number generator is good enough here
-				"etcd.inmemory.infrastructure.cluster.x-k8s.io/member-id":  fmt.Sprintf("%d", rand.Uint32()), //nolint:gosec // weak random number generator is good enough here
-				// TODO: set this only if there are no other leaders.
-				"etcd.inmemory.infrastructure.cluster.x-k8s.io/leader-from": time.Now().Format(time.RFC3339),
-			},
 		},
 		Status: corev1.PodStatus{
 			Phase: corev1.PodRunning,
@@ -444,6 +439,42 @@ func (r *InMemoryMachineReconciler) reconcileNormalETCD(ctx context.Context, clu
 		return ctrl.Result{}, errors.Wrapf(err, "failed to get etcd Pod")
 	}
 
+	// Gets info about the current etcd cluster, if any.
+	info, err := r.inspectEtcd(ctx, cloudClient)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// If this is the first etcd member in the cluster, assign a cluster ID
+	if info.clusterID == "" {
+		for {
+			info.clusterID = fmt.Sprintf("%d", rand.Uint32()) //nolint:gosec // weak random number generator is good enough here
+			if info.clusterID != "0" {
+				break
+			}
+		}
+	}
+
+	// Computes a unique memberID.
+	var memberID string
+	for {
+		memberID = fmt.Sprintf("%d", rand.Uint32()) //nolint:gosec // weak random number generator is good enough here
+		if !info.members.Has(memberID) && memberID != "0" {
+			break
+		}
+	}
+
+	// Annotate the pod with the info about the etcd cluster.
+	etcdPod.Annotations = map[string]string{
+		cloudv1.EtcdClusterIDAnnotationName: info.clusterID,
+		cloudv1.EtcdMemberIDAnnotationName:  memberID,
+	}
+
+	// If the etcd cluster is being created it doesn't have a leader yet, so set this member as a leader.
+	if info.leaderID == "" {
+		etcdPod.Annotations[cloudv1.EtcdLeaderFromAnnotationName] = time.Now().Format(time.RFC3339)
+	}
+
 	// NOTE: for the first control plane machine we might create the etcd pod before the API server pod is running
 	// but this is not an issue, because it won't be visible to CAPI until the API server start serving requests.
 	if err := cloudClient.Create(ctx, etcdPod); err != nil && !apierrors.IsAlreadyExists(err) {
@@ -487,6 +518,58 @@ func (r *InMemoryMachineReconciler) reconcileNormalETCD(ctx context.Context, clu
 	return ctrl.Result{}, nil
 }
 
+type etcdInfo struct {
+	clusterID string
+	leaderID  string
+	members   sets.Set[string]
+}
+
+func (r *InMemoryMachineReconciler) inspectEtcd(ctx context.Context, cloudClient cclient.Client) (etcdInfo, error) {
+	etcdPods := &corev1.PodList{}
+	if err := cloudClient.List(ctx, etcdPods,
+		client.InNamespace(metav1.NamespaceSystem),
+		client.MatchingLabels{
+			"component": "etcd",
+			"tier":      "control-plane"},
+	); err != nil {
+		return etcdInfo{}, errors.Wrap(err, "failed to list etcd members")
+	}
+
+	if len(etcdPods.Items) == 0 {
+		return etcdInfo{}, nil
+	}
+
+	info := etcdInfo{
+		members: sets.New[string](),
+	}
+	var leaderFrom time.Time
+	for _, pod := range etcdPods.Items {
+		if info.clusterID == "" {
+			info.clusterID = pod.Annotations[cloudv1.EtcdClusterIDAnnotationName]
+		} else if pod.Annotations[cloudv1.EtcdClusterIDAnnotationName] != info.clusterID {
+			return etcdInfo{}, errors.New("invalid etcd cluster, members have different cluster ID")
+		}
+		memberID := pod.Annotations[cloudv1.EtcdMemberIDAnnotationName]
+		info.members.Insert(memberID)
+
+		if t, err := time.Parse(time.RFC3339, pod.Annotations[cloudv1.EtcdLeaderFromAnnotationName]); err == nil {
+			if t.After(leaderFrom) {
+				info.leaderID = memberID
+				leaderFrom = t
+			}
+		}
+	}
+
+	if info.leaderID == "" {
+		// TODO: consider if and how to automatically recover from this case.
+		// Note: this can also happen when reading etcd members in the server; it might be something we have to take care of before deletion...
+		// For now it should not be an issue, because KCP forwards etcd leadership before deletion.
+		return etcdInfo{}, errors.New("invalid etcd cluster, no leader found")
+	}
+
+	return info, nil
+}
+
 func (r *InMemoryMachineReconciler) reconcileNormalAPIServer(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, inMemoryMachine *infrav1.InMemoryMachine) (ctrl.Result, error) {
 	// No-op if the machine is not a control plane machine.
 	if !util.IsControlPlaneMachine(machine) {
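
One detail worth noting in the ID generation above: cluster and member IDs are stored as annotation strings but parsed as integers by the etcd handler (see handler.go below), which treats a parsed value of 0 as "not set"; that appears to be why the generation loops skip "0". A standalone sketch of the same idea follows, with a hypothetical helper name, purely for illustration.

package example

import (
	"fmt"
	"math/rand"

	"k8s.io/apimachinery/pkg/util/sets"
)

// pickUniqueID returns a random uint32 rendered as a string, skipping "0" (which readers of the
// annotations treat as "not set") and any ID already used by an existing etcd member.
func pickUniqueID(used sets.Set[string]) string {
	for {
		id := fmt.Sprintf("%d", rand.Uint32()) //nolint:gosec // weak randomness is fine for a fake etcd
		if id != "0" && !used.Has(id) {
			return id
		}
	}
}

In the commit itself this logic is inlined in reconcileNormalETCD rather than factored into a helper.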

test/infrastructure/inmemory/internal/controllers/inmemorymachine_controller_test.go

Lines changed: 91 additions & 12 deletions
@@ -253,13 +253,13 @@ func TestReconcileNormalNode(t *testing.T) {
 func TestReconcileNormalEtcd(t *testing.T) {
 	inMemoryMachineWithNodeNotYetProvisioned := &infrav1.InMemoryMachine{
 		ObjectMeta: metav1.ObjectMeta{
-			Name: "bar",
+			Name: "bar0",
 		},
 	}
 
-	inMemoryMachineWithNodeProvisioned := &infrav1.InMemoryMachine{
+	inMemoryMachineWithNodeProvisioned1 := &infrav1.InMemoryMachine{
 		ObjectMeta: metav1.ObjectMeta{
-			Name: "bar",
+			Name: "bar1",
 		},
 		Spec: infrav1.InMemoryMachineSpec{
 			Behaviour: &infrav1.InMemoryMachineBehaviour{
@@ -313,7 +313,7 @@ func TestReconcileNormalEtcd(t *testing.T) {
 
 		manager := cmanager.New(scheme)
 
-		host := "127.0.0.1"
+		host := "127.0.0.1" //nolint:goconst
 		wcmux, err := server.NewWorkloadClustersMux(manager, host, server.CustomPorts{
 			// NOTE: make sure to use ports different than other tests, so we can run tests in parallel
 			MinPort:   server.DefaultMinPort + 1000,
@@ -332,15 +332,15 @@ func TestReconcileNormalEtcd(t *testing.T) {
 		r.CloudManager.AddResourceGroup(klog.KObj(cluster).String())
 		c := r.CloudManager.GetResourceGroup(klog.KObj(cluster).String()).GetClient()
 
-		res, err := r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned)
+		res, err := r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned1)
 		g.Expect(err).ToNot(HaveOccurred())
 		g.Expect(res.IsZero()).To(BeFalse())
-		g.Expect(conditions.IsFalse(inMemoryMachineWithNodeProvisioned, infrav1.EtcdProvisionedCondition)).To(BeTrue())
+		g.Expect(conditions.IsFalse(inMemoryMachineWithNodeProvisioned1, infrav1.EtcdProvisionedCondition)).To(BeTrue())
 
 		got := &corev1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
 				Namespace: metav1.NamespaceSystem,
-				Name:      fmt.Sprintf("etcd-%s", inMemoryMachineWithNodeNotYetProvisioned.Name),
+				Name:      fmt.Sprintf("etcd-%s", inMemoryMachineWithNodeProvisioned1.Name),
 			},
 		}
 		err = c.Get(ctx, client.ObjectKeyFromObject(got), got)
@@ -350,32 +350,111 @@ func TestReconcileNormalEtcd(t *testing.T) {
 			g := NewWithT(t)
 
 			g.Eventually(func() bool {
-				res, err := r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned)
+				res, err := r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned1)
 				g.Expect(err).ToNot(HaveOccurred())
 				if !res.IsZero() {
 					time.Sleep(res.RequeueAfter / 100 * 90)
 				}
 				return res.IsZero()
-			}, inMemoryMachineWithNodeProvisioned.Spec.Behaviour.Etcd.Provisioning.StartupDuration.Duration*2).Should(BeTrue())
+			}, inMemoryMachineWithNodeProvisioned1.Spec.Behaviour.Etcd.Provisioning.StartupDuration.Duration*2).Should(BeTrue())
 
 			err = c.Get(ctx, client.ObjectKeyFromObject(got), got)
 			g.Expect(err).ToNot(HaveOccurred())
 
-			g.Expect(conditions.IsTrue(inMemoryMachineWithNodeProvisioned, infrav1.EtcdProvisionedCondition)).To(BeTrue())
-			g.Expect(conditions.Get(inMemoryMachineWithNodeProvisioned, infrav1.EtcdProvisionedCondition).LastTransitionTime.Time).To(BeTemporally(">", conditions.Get(inMemoryMachineWithNodeProvisioned, infrav1.NodeProvisionedCondition).LastTransitionTime.Time, inMemoryMachineWithNodeProvisioned.Spec.Behaviour.Etcd.Provisioning.StartupDuration.Duration))
+			g.Expect(got.Annotations).To(HaveKey(cloudv1.EtcdClusterIDAnnotationName))
+			g.Expect(got.Annotations).To(HaveKey(cloudv1.EtcdMemberIDAnnotationName))
+			g.Expect(got.Annotations).To(HaveKey(cloudv1.EtcdLeaderFromAnnotationName))
+
+			g.Expect(conditions.IsTrue(inMemoryMachineWithNodeProvisioned1, infrav1.EtcdProvisionedCondition)).To(BeTrue())
+			g.Expect(conditions.Get(inMemoryMachineWithNodeProvisioned1, infrav1.EtcdProvisionedCondition).LastTransitionTime.Time).To(BeTemporally(">", conditions.Get(inMemoryMachineWithNodeProvisioned1, infrav1.NodeProvisionedCondition).LastTransitionTime.Time, inMemoryMachineWithNodeProvisioned1.Spec.Behaviour.Etcd.Provisioning.StartupDuration.Duration))
 		})
 
 		t.Run("no-op after it is provisioned", func(t *testing.T) {
 			g := NewWithT(t)
 
-			res, err := r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned)
+			res, err := r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned1)
 			g.Expect(err).ToNot(HaveOccurred())
 			g.Expect(res.IsZero()).To(BeTrue())
 		})
 
 		err = wcmux.Shutdown(ctx)
 		g.Expect(err).ToNot(HaveOccurred())
 	})
+
+	t.Run("takes care of the etcd cluster annotations", func(t *testing.T) {
+		g := NewWithT(t)
+
+		inMemoryMachineWithNodeProvisioned1 := inMemoryMachineWithNodeProvisioned1.DeepCopy()
+		inMemoryMachineWithNodeProvisioned1.Spec = infrav1.InMemoryMachineSpec{}
+
+		inMemoryMachineWithNodeProvisioned2 := inMemoryMachineWithNodeProvisioned1.DeepCopy()
+		inMemoryMachineWithNodeProvisioned2.Name = "bar2"
+
+		manager := cmanager.New(scheme)
+
+		host := "127.0.0.1"
+		wcmux, err := server.NewWorkloadClustersMux(manager, host, server.CustomPorts{
+			// NOTE: make sure to use ports different than other tests, so we can run tests in parallel
+			MinPort:   server.DefaultMinPort + 1200,
+			MaxPort:   server.DefaultMinPort + 1299,
+			DebugPort: server.DefaultDebugPort + 20,
+		})
+		g.Expect(err).ToNot(HaveOccurred())
+		_, err = wcmux.InitWorkloadClusterListener(klog.KObj(cluster).String())
+		g.Expect(err).ToNot(HaveOccurred())
+
+		r := InMemoryMachineReconciler{
+			Client:       fake.NewClientBuilder().WithScheme(scheme).WithObjects(createCASecret(t, cluster, secretutil.EtcdCA)).Build(),
+			CloudManager: manager,
+			APIServerMux: wcmux,
+		}
+		r.CloudManager.AddResourceGroup(klog.KObj(cluster).String())
+		c := r.CloudManager.GetResourceGroup(klog.KObj(cluster).String()).GetClient()
+
+		// the first etcd pod gets annotated with clusterID and memberID, and is also set as the leader
+
+		res, err := r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned1)
+		g.Expect(err).ToNot(HaveOccurred())
+		g.Expect(res.IsZero()).To(BeTrue())
+		g.Expect(conditions.IsTrue(inMemoryMachineWithNodeProvisioned1, infrav1.EtcdProvisionedCondition)).To(BeTrue())
+
+		got1 := &corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Namespace: metav1.NamespaceSystem,
+				Name:      fmt.Sprintf("etcd-%s", inMemoryMachineWithNodeProvisioned1.Name),
+			},
+		}
+
+		err = c.Get(ctx, client.ObjectKeyFromObject(got1), got1)
+		g.Expect(err).ToNot(HaveOccurred())
+
+		g.Expect(got1.Annotations).To(HaveKey(cloudv1.EtcdClusterIDAnnotationName))
+		g.Expect(got1.Annotations).To(HaveKey(cloudv1.EtcdMemberIDAnnotationName))
+		g.Expect(got1.Annotations).To(HaveKey(cloudv1.EtcdLeaderFromAnnotationName))
+
+		// the second etcd pod gets annotated with the same clusterID and a new memberID (but it is not set as the leader)
+
+		res, err = r.reconcileNormalETCD(ctx, cluster, cpMachine, inMemoryMachineWithNodeProvisioned2)
+		g.Expect(err).ToNot(HaveOccurred())
+		g.Expect(res.IsZero()).To(BeTrue())
+		g.Expect(conditions.IsTrue(inMemoryMachineWithNodeProvisioned2, infrav1.EtcdProvisionedCondition)).To(BeTrue())
+
+		got2 := &corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Namespace: metav1.NamespaceSystem,
+				Name:      fmt.Sprintf("etcd-%s", inMemoryMachineWithNodeProvisioned2.Name),
+			},
+		}
+
+		err = c.Get(ctx, client.ObjectKeyFromObject(got2), got2)
+		g.Expect(err).ToNot(HaveOccurred())
+
+		g.Expect(got2.Annotations).To(HaveKey(cloudv1.EtcdClusterIDAnnotationName))
+		g.Expect(got1.Annotations[cloudv1.EtcdClusterIDAnnotationName]).To(Equal(got2.Annotations[cloudv1.EtcdClusterIDAnnotationName]))
+		g.Expect(got2.Annotations).To(HaveKey(cloudv1.EtcdMemberIDAnnotationName))
+		g.Expect(got1.Annotations[cloudv1.EtcdMemberIDAnnotationName]).ToNot(Equal(got2.Annotations[cloudv1.EtcdMemberIDAnnotationName]))
+		g.Expect(got2.Annotations).ToNot(HaveKey(cloudv1.EtcdLeaderFromAnnotationName))
+	})
 }
 
 func TestReconcileNormalApiServer(t *testing.T) {

test/infrastructure/inmemory/internal/server/etcd/handler.go

Lines changed: 20 additions & 5 deletions
@@ -33,6 +33,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
+	cloudv1 "sigs.k8s.io/cluster-api/test/infrastructure/inmemory/internal/cloud/api/v1alpha1"
 	cclient "sigs.k8s.io/cluster-api/test/infrastructure/inmemory/internal/cloud/runtime/client"
 	cmanager "sigs.k8s.io/cluster-api/test/infrastructure/inmemory/internal/cloud/runtime/manager"
 )
@@ -192,19 +193,26 @@ func (b *baseServer) inspectEtcd(ctx context.Context, cloudClient cclient.Client
 
 	memberList := &pb.MemberListResponse{}
 	statusResponse := &pb.StatusResponse{}
+	var clusterID int
 	var leaderID int
 	var leaderFrom time.Time
 	for _, pod := range etcdPods.Items {
-		clusterID, err := strconv.Atoi(pod.Annotations["etcd.inmemory.infrastructure.cluster.x-k8s.io/cluster-id"])
-		if err != nil {
-			return nil, nil, errors.Wrapf(err, "failed read cluster ID annotation from etcd member with name %s", pod.Name)
+		if clusterID == 0 {
+			var err error
+			clusterID, err = strconv.Atoi(pod.Annotations[cloudv1.EtcdClusterIDAnnotationName])
+			if err != nil {
+				return nil, nil, errors.Wrapf(err, "failed read cluster ID annotation from etcd member with name %s", pod.Name)
+			}
+		} else if pod.Annotations[cloudv1.EtcdClusterIDAnnotationName] != fmt.Sprintf("%d", clusterID) {
+			return nil, nil, errors.New("invalid etcd cluster, members have different cluster ID")
 		}
-		memberID, err := strconv.Atoi(pod.Annotations["etcd.inmemory.infrastructure.cluster.x-k8s.io/member-id"])
+
+		memberID, err := strconv.Atoi(pod.Annotations[cloudv1.EtcdMemberIDAnnotationName])
 		if err != nil {
 			return nil, nil, errors.Wrapf(err, "failed read member ID annotation from etcd member with name %s", pod.Name)
 		}
 
-		if t, err := time.Parse(time.RFC3339, pod.Annotations["etcd.inmemory.infrastructure.cluster.x-k8s.io/leader-from"]); err == nil {
+		if t, err := time.Parse(time.RFC3339, pod.Annotations[cloudv1.EtcdLeaderFromAnnotationName]); err == nil {
 			if t.After(leaderFrom) {
 				leaderID = memberID
 				leaderFrom = t
@@ -224,6 +232,13 @@ func (b *baseServer) inspectEtcd(ctx context.Context, cloudClient cclient.Client
 			Name: strings.TrimPrefix(pod.Name, "etcd-"),
 		})
 	}
+
+	if leaderID == 0 {
+		// TODO: consider if and how to automatically recover from this case.
+		// Note: this can also happen when adding a new etcd member in the handler; it might be something we have to take care of before deletion...
+		// For now it should not be an issue, because KCP forwards etcd leadership before deletion.
+		return nil, nil, errors.New("invalid etcd cluster, no leader found")
+	}
 	statusResponse.Leader = uint64(leaderID)
 
 	return memberList, statusResponse, nil
