@@ -6,6 +6,8 @@ package controllers
6
6
7
7
import (
8
8
"context"
9
+ "fmt"
10
+ "strings"
9
11
10
12
"github.com/talos-systems/talos/pkg/machinery/api/machine"
11
13
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
@@ -45,3 +47,108 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
45
47
46
48
return nil
47
49
}
50
+
51
+ // forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
52
+ // This is used in times when the machine was deleted out from under us.
53
+ func (r * TalosControlPlaneReconciler ) forceEtcdLeave (ctx context.Context , c * talosclient.Client , cluster client.ObjectKey , memberName string ) error {
54
+ r .Log .Info ("Removing etcd member" , "memberName" , memberName )
55
+
56
+ err := c .EtcdRemoveMember (
57
+ ctx ,
58
+ & machine.EtcdRemoveMemberRequest {
59
+ Member : memberName ,
60
+ },
61
+ )
62
+ if err != nil {
63
+ return err
64
+ }
65
+
66
+ return nil
67
+ }
68
+
69
+ // auditEtcd rolls through all etcd members to see if there's a matching controlplane machine
70
+ // It uses the first controlplane node returned as the etcd endpoint
71
+ func (r * TalosControlPlaneReconciler ) auditEtcd (ctx context.Context , cluster client.ObjectKey , cpName string ) error {
72
+ machines , err := r .getControlPlaneMachinesForCluster (ctx , cluster , cpName )
73
+ if err != nil {
74
+ return err
75
+ }
76
+
77
+ if len (machines ) == 0 {
78
+ return nil
79
+ }
80
+
81
+ for _ , machine := range machines {
82
+ // nb: we'll assume any machine that doesn't have a noderef is new and we can audit later because
83
+ // otherwise a new etcd member can get removed before even getting the noderef set by the CAPI controllers.
84
+ if machine .Status .NodeRef == nil {
85
+ return fmt .Errorf ("some CP machines do not have a noderef" )
86
+ }
87
+ }
88
+ // Select the first CP machine that's not being deleted and has a noderef
89
+ var designatedCPMachine capiv1.Machine
90
+
91
+ for _ , machine := range machines {
92
+ if machine .ObjectMeta .DeletionTimestamp .IsZero () && machine .Status .NodeRef != nil {
93
+ designatedCPMachine = machine
94
+ break
95
+ }
96
+ }
97
+
98
+ clientset , err := r .kubeconfigForCluster (ctx , cluster )
99
+ if err != nil {
100
+ return err
101
+ }
102
+
103
+ c , err := r .talosconfigForMachine (ctx , clientset , designatedCPMachine )
104
+ if err != nil {
105
+ return err
106
+ }
107
+
108
+ // Save the first internal IP of the designated machine to use as our node target
109
+ // and setup the ctx to target it
110
+ var firstIntAddr string
111
+
112
+ for _ , addr := range designatedCPMachine .Status .Addresses {
113
+ if addr .Type == capiv1 .MachineInternalIP {
114
+ firstIntAddr = addr .Address
115
+ break
116
+ }
117
+ }
118
+
119
+ nodeCtx := talosclient .WithNodes (ctx , firstIntAddr )
120
+
121
+ response , err := c .EtcdMemberList (nodeCtx , & machine.EtcdMemberListRequest {})
122
+ if err != nil {
123
+ return err
124
+ }
125
+
126
+ // Only querying one CP node, so only 1 message should return.
127
+ memberList := response .Messages [0 ]
128
+
129
+ if len (memberList .Members ) == 0 {
130
+ return nil
131
+ }
132
+
133
+ // For each etcd member, look through the list of machines and see if noderef matches
134
+ for _ , member := range memberList .Members {
135
+ present := false
136
+ for _ , machine := range machines {
137
+ // break apart the noderef name in case it's an fqdn (like in AWS)
138
+ machineNodeNameExploded := strings .Split (machine .Status .NodeRef .Name , "." )
139
+
140
+ if machineNodeNameExploded [0 ] == member {
141
+ present = true
142
+ break
143
+ }
144
+ }
145
+
146
+ if ! present {
147
+ r .Log .Info ("found etcd member that doesn't exist as controlplane machine" , "member" , member )
148
+
149
+ r .forceEtcdLeave (nodeCtx , c , cluster , member )
150
+ }
151
+ }
152
+
153
+ return nil
154
+ }
0 commit comments