@@ -11,15 +11,14 @@ import (
11
11
"time"
12
12
13
13
controlplanev1 "github.com/siderolabs/cluster-api-control-plane-provider-talos/api/v1alpha3"
14
- "github.com/siderolabs/talos/pkg/machinery/api/machine"
14
+ machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine"
15
15
talosclient "github.com/siderolabs/talos/pkg/machinery/client"
16
16
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
17
17
"sigs.k8s.io/controller-runtime/pkg/client"
18
18
)
19
19
20
20
func (r * TalosControlPlaneReconciler ) etcdHealthcheck (ctx context.Context , tcp * controlplanev1.TalosControlPlane , ownedMachines []clusterv1.Machine ) error {
21
21
ctx , cancel := context .WithTimeout (ctx , time .Second * 5 )
22
-
23
22
defer cancel ()
24
23
25
24
machines := []clusterv1.Machine {}
@@ -30,70 +29,80 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *
30
29
}
31
30
}
32
31
33
- c , err := r .talosconfigForMachines (ctx , tcp , machines ... )
34
- if err != nil {
35
- return err
36
- }
37
-
38
- defer c .Close () //nolint:errcheck
39
-
40
- service := "etcd"
41
-
42
- params := make ([]interface {}, 0 , len (machines )* 2 )
32
+ params := make ([]any , 0 , len (machines )* 2 )
43
33
for _ , machine := range machines {
44
34
params = append (params , "node" , machine .Name )
45
35
}
46
36
47
37
r .Log .Info ("verifying etcd health on all nodes" , params ... )
48
38
49
- svcs , err := c .ServiceInfo (ctx , service )
50
- if err != nil {
51
- return err
52
- }
39
+ const service = "etcd"
53
40
54
- // check that etcd service is healthy on all nodes
55
- for _ , svc := range svcs {
56
- node := svc .Metadata .GetHostname ()
41
+ // list of discovered etcd members, updated on each iteration
42
+ members := map [string ]struct {}{}
57
43
58
- if len (svc .Service .Events .Events ) == 0 {
59
- return fmt .Errorf ("%s: no events recorded yet for service %q" , node , service )
60
- }
44
+ for i , machine := range machines {
45
+ // handle each machine separately: the client created in each iteration has endpoints which point to a single machine
46
+ if err := func () error {
47
+ c , err := r .talosconfigForMachines (ctx , tcp , machine )
48
+ if err != nil {
49
+ return err
50
+ }
61
51
62
- lastEvent := svc .Service .Events .Events [len (svc .Service .Events .Events )- 1 ]
63
- if lastEvent .State != "Running" {
64
- return fmt .Errorf ("%s: service %q not in expected state %q: current state [%s] %s" , node , service , "Running" , lastEvent .State , lastEvent .Msg )
65
- }
52
+ defer c .Close () //nolint:errcheck
66
53
67
- if ! svc . Service . GetHealth (). GetHealthy () {
68
- return fmt . Errorf ( "%s: service is not healthy: %s" , node , service )
69
- }
70
- }
54
+ svcs , err := c . ServiceInfo ( ctx , service )
55
+ if err != nil {
56
+ return err
57
+ }
71
58
72
- resp , err := c .EtcdMemberList (ctx , & machine.EtcdMemberListRequest {})
73
- if err != nil {
74
- return err
75
- }
59
+ // check that etcd service is healthy on the node
60
+ for _ , svc := range svcs {
61
+ node := svc .Metadata .GetHostname ()
76
62
77
- members := map [string ]struct {}{}
63
+ if len (svc .Service .Events .Events ) == 0 {
64
+ return fmt .Errorf ("%s: no events recorded yet for service %q" , node , service )
65
+ }
66
+
67
+ lastEvent := svc .Service .Events .Events [len (svc .Service .Events .Events )- 1 ]
68
+ if lastEvent .State != "Running" {
69
+ return fmt .Errorf ("%s: service %q not in expected state %q: current state [%s] %s" , node , service , "Running" , lastEvent .State , lastEvent .Msg )
70
+ }
71
+
72
+ if ! svc .Service .GetHealth ().GetHealthy () {
73
+ return fmt .Errorf ("%s: service is not healthy: %s" , node , service )
74
+ }
75
+ }
76
+
77
+ resp , err := c .EtcdMemberList (ctx , & machineapi.EtcdMemberListRequest {})
78
+ if err != nil {
79
+ return err
80
+ }
78
81
79
- for i , message := range resp .Messages {
80
- actualMembers := len (message .Members )
81
- expectedMembers := len (machines )
82
+ for _ , message := range resp .Messages {
83
+ actualMembers := len (message .Members )
84
+ expectedMembers := len (machines )
82
85
83
- node := message .Metadata .GetHostname ()
86
+ node := message .Metadata .GetHostname ()
84
87
85
- // check that the count of members is the same on all nodes
86
- if actualMembers != expectedMembers {
87
- return fmt .Errorf ("%s: expected to have %d members, got %d" , node , expectedMembers , actualMembers )
88
- }
88
+ // check that the member count reported by this node matches the number of machines
89
+ if actualMembers != expectedMembers {
90
+ return fmt .Errorf ("%s: expected to have %d members, got %d" , node , expectedMembers , actualMembers )
91
+ }
89
92
90
- // check that member list is the same on all nodes
91
- for _ , member := range message .Members {
92
- if _ , found := members [member .Hostname ]; i > 0 && ! found {
93
- return fmt .Errorf ("%s: found extra etcd member %s" , node , member .Hostname )
93
+ // check that member list is the same on all nodes
94
+ for _ , member := range message .Members {
95
+ if _ , found := members [member .Hostname ]; i > 0 && ! found {
96
+ return fmt .Errorf ("%s: found extra etcd member %s" , node , member .Hostname )
97
+ }
98
+
99
+ members [member .Hostname ] = struct {}{}
100
+ }
94
101
}
95
102
96
- members [member .Hostname ] = struct {}{}
103
+ return nil
104
+ }(); err != nil {
105
+ return fmt .Errorf ("error checking etcd health on machine %q: %w" , machines [i ].Name , err )
97
106
}
98
107
}
99
108
@@ -118,14 +127,14 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
118
127
if svc .Service .State != "Finished" {
119
128
r .Log .Info ("forfeiting leadership" , "machine" , machineToLeave .Status .NodeRef .Name )
120
129
121
- _ , err = c .EtcdForfeitLeadership (ctx , & machine .EtcdForfeitLeadershipRequest {})
130
+ _ , err = c .EtcdForfeitLeadership (ctx , & machineapi .EtcdForfeitLeadershipRequest {})
122
131
if err != nil {
123
132
return err
124
133
}
125
134
126
135
r .Log .Info ("leaving etcd" , "machine" , machineToLeave .Name , "node" , machineToLeave .Status .NodeRef .Name )
127
136
128
- err = c .EtcdLeaveCluster (ctx , & machine .EtcdLeaveClusterRequest {})
137
+ err = c .EtcdLeaveCluster (ctx , & machineapi .EtcdLeaveClusterRequest {})
129
138
if err != nil {
130
139
return err
131
140
}
@@ -137,7 +146,7 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
137
146
138
147
// forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
139
148
// This is used in times when the machine was deleted out from under us.
140
- func (r * TalosControlPlaneReconciler ) forceEtcdLeave (ctx context.Context , c * talosclient.Client , member * machine .EtcdMember ) error {
149
+ func (r * TalosControlPlaneReconciler ) forceEtcdLeave (ctx context.Context , c * talosclient.Client , member * machineapi .EtcdMember ) error {
141
150
ctx , cancel := context .WithTimeout (ctx , time .Second * 5 )
142
151
143
152
defer cancel ()
@@ -146,7 +155,7 @@ func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *tal
146
155
147
156
return c .EtcdRemoveMemberByID (
148
157
ctx ,
149
- & machine .EtcdRemoveMemberByIDRequest {
158
+ & machineapi .EtcdRemoveMemberByIDRequest {
150
159
MemberId : member .Id ,
151
160
},
152
161
)
@@ -199,7 +208,7 @@ func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, tcp *contro
199
208
200
209
defer c .Close () //nolint:errcheck
201
210
202
- response , err := c .EtcdMemberList (ctx , & machine .EtcdMemberListRequest {})
211
+ response , err := c .EtcdMemberList (ctx , & machineapi .EtcdMemberListRequest {})
203
212
if err != nil {
204
213
return fmt .Errorf ("error getting etcd members via %q (endpoints %v): %w" , designatedCPMachine .Name , c .GetConfigContext ().Endpoints , err )
205
214
}
0 commit comments