Skip to content

Commit fb0257d

Browse files
rsmitty authored and talos-bot committed
feat: audit and remove etcd members if machines no longer exist
This PR introduces some auditing functionality for etcd in the workload clusters. If there are members in the workload etcd that don't correspond to any CP node, the members will be removed. This helps in the case where a machine is deleted out from under us and a new CP machine comes up but can't join etcd. Signed-off-by: Spencer Smith <[email protected]>
1 parent 182f656 commit fb0257d

File tree

4 files changed

+187
-28
lines changed

4 files changed

+187
-28
lines changed

controllers/etcd.go

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ package controllers
66

77
import (
88
"context"
9+
"fmt"
10+
"strings"
911

1012
"github.com/talos-systems/talos/pkg/machinery/api/machine"
1113
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
@@ -45,3 +47,108 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
4547

4648
return nil
4749
}
50+
51+
// forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
52+
// This is used in times when the machine was deleted out from under us.
53+
func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
54+
r.Log.Info("Removing etcd member", "memberName", memberName)
55+
56+
err := c.EtcdRemoveMember(
57+
ctx,
58+
&machine.EtcdRemoveMemberRequest{
59+
Member: memberName,
60+
},
61+
)
62+
if err != nil {
63+
return err
64+
}
65+
66+
return nil
67+
}
68+
69+
// auditEtcd rolls through all etcd members to see if there's a matching controlplane machine
70+
// It uses the first controlplane node returned as the etcd endpoint
71+
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster client.ObjectKey, cpName string) error {
72+
machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
73+
if err != nil {
74+
return err
75+
}
76+
77+
if len(machines) == 0 {
78+
return nil
79+
}
80+
81+
for _, machine := range machines {
82+
// nb: we'll assume any machine that doesn't have a noderef is new and we can audit later because
83+
// otherwise a new etcd member can get removed before even getting the noderef set by the CAPI controllers.
84+
if machine.Status.NodeRef == nil {
85+
return fmt.Errorf("some CP machines do not have a noderef")
86+
}
87+
}
88+
// Select the first CP machine that's not being deleted and has a noderef
89+
var designatedCPMachine capiv1.Machine
90+
91+
for _, machine := range machines {
92+
if machine.ObjectMeta.DeletionTimestamp.IsZero() && machine.Status.NodeRef != nil {
93+
designatedCPMachine = machine
94+
break
95+
}
96+
}
97+
98+
clientset, err := r.kubeconfigForCluster(ctx, cluster)
99+
if err != nil {
100+
return err
101+
}
102+
103+
c, err := r.talosconfigForMachine(ctx, clientset, designatedCPMachine)
104+
if err != nil {
105+
return err
106+
}
107+
108+
// Save the first internal IP of the designated machine to use as our node target
109+
// and setup the ctx to target it
110+
var firstIntAddr string
111+
112+
for _, addr := range designatedCPMachine.Status.Addresses {
113+
if addr.Type == capiv1.MachineInternalIP {
114+
firstIntAddr = addr.Address
115+
break
116+
}
117+
}
118+
119+
nodeCtx := talosclient.WithNodes(ctx, firstIntAddr)
120+
121+
response, err := c.EtcdMemberList(nodeCtx, &machine.EtcdMemberListRequest{})
122+
if err != nil {
123+
return err
124+
}
125+
126+
// Only querying one CP node, so only 1 message should return.
127+
memberList := response.Messages[0]
128+
129+
if len(memberList.Members) == 0 {
130+
return nil
131+
}
132+
133+
// For each etcd member, look through the list of machines and see if noderef matches
134+
for _, member := range memberList.Members {
135+
present := false
136+
for _, machine := range machines {
137+
// break apart the noderef name in case it's an fqdn (like in AWS)
138+
machineNodeNameExploded := strings.Split(machine.Status.NodeRef.Name, ".")
139+
140+
if machineNodeNameExploded[0] == member {
141+
present = true
142+
break
143+
}
144+
}
145+
146+
if !present {
147+
r.Log.Info("found etcd member that doesn't exist as controlplane machine", "member", member)
148+
149+
r.forceEtcdLeave(nodeCtx, c, cluster, member)
150+
}
151+
}
152+
153+
return nil
154+
}

controllers/taloscontrolplane_controller.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,12 @@ func (r *TalosControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Resu
245245
return ctrl.Result{}, err
246246
}
247247

248+
// Audit the etcd member list to remove any nodes that no longer exist
249+
if err := r.auditEtcd(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name); err != nil {
250+
logger.Info("failed to check etcd membership list", "error", err)
251+
return ctrl.Result{Requeue: true}, nil
252+
}
253+
248254
return ctrl.Result{}, nil
249255
}
250256

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@ go 1.16
55
require (
66
cloud.google.com/go v0.47.0 // indirect
77
github.com/go-logr/logr v0.1.0
8-
github.com/onsi/ginkgo v1.12.1
8+
github.com/onsi/ginkgo v1.15.0
99
github.com/onsi/gomega v1.10.1
1010
github.com/pkg/errors v0.9.1
11-
github.com/talos-systems/cluster-api-bootstrap-provider-talos v0.2.0-alpha.10
12-
github.com/talos-systems/talos/pkg/machinery v0.0.0-20210218160848-32d25885288f
11+
github.com/talos-systems/cluster-api-bootstrap-provider-talos v0.2.0-alpha.12
12+
github.com/talos-systems/talos/pkg/machinery v0.0.0-20210520203624-828772cec9a3
1313
k8s.io/api v0.17.9
1414
k8s.io/apimachinery v0.17.9
1515
k8s.io/apiserver v0.17.9

0 commit comments

Comments
 (0)