Skip to content

Commit 6b8c220

Browse files
nasusoba and qliang authored
add etcd reconcile logic (#103)
Add machine controller to reconcile node etcd removal. Add comment and small fix. Slightly fixed the comment. Signed-off-by: nasusoba <[email protected]> Co-authored-by: qliang <[email protected]>
1 parent 286bede commit 6b8c220

File tree

9 files changed

+459
-64
lines changed

9 files changed

+459
-64
lines changed

controlplane/controllers/const.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,6 @@ const (
3030
// dependentCertRequeueAfter is how long to wait before checking again to see if
3131
// dependent certificates have been created.
3232
dependentCertRequeueAfter = 30 * time.Second
33+
34+
k3sHookName = "k3s"
3335
)

controlplane/controllers/kthreescontrolplane_controller.go

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@ package controllers
1818

1919
import (
2020
"context"
21-
"errors"
2221
"fmt"
2322
"strings"
2423
"time"
2524

2625
"github.com/go-logr/logr"
26+
"github.com/pkg/errors"
2727
corev1 "k8s.io/api/core/v1"
2828
apierrors "k8s.io/apimachinery/pkg/api/errors"
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -480,9 +480,9 @@ func (r *KThreesControlPlaneReconciler) reconcile(ctx context.Context, cluster *
480480

481481
// Ensures the number of etcd members is in sync with the number of machines/nodes.
482482
// NOTE: This is usually required after a machine deletion.
483-
// if result, err := r.reconcileEtcdMembers(ctx, controlPlane); err != nil || !result.IsZero() {
484-
// return result, err
485-
// }
483+
if err := r.reconcileEtcdMembers(ctx, controlPlane); err != nil {
484+
return reconcile.Result{}, err
485+
}
486486

487487
// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
488488
// otherwise continue with the other KCP operations.
@@ -655,6 +655,57 @@ func (r *KThreesControlPlaneReconciler) reconcileControlPlaneConditions(ctx cont
655655
return nil
656656
}
657657

658+
// reconcileEtcdMembers ensures the number of etcd members is in sync with the number of machines/nodes.
659+
// This is usually required after a machine deletion.
660+
//
661+
// NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this.
662+
func (r *KThreesControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context, controlPlane *k3s.ControlPlane) error {
663+
log := ctrl.LoggerFrom(ctx)
664+
665+
// If etcd is not managed by KCP this is a no-op.
666+
if !controlPlane.IsEtcdManaged() {
667+
return nil
668+
}
669+
670+
// If there is no KCP-owned control-plane machines, then control-plane has not been initialized yet.
671+
if controlPlane.Machines.Len() == 0 {
672+
return nil
673+
}
674+
675+
// Collect all the node names.
676+
nodeNames := []string{}
677+
for _, machine := range controlPlane.Machines {
678+
if machine.Status.NodeRef == nil {
679+
// If there are provisioning machines (machines without a node yet), return.
680+
return nil
681+
}
682+
nodeNames = append(nodeNames, machine.Status.NodeRef.Name)
683+
}
684+
685+
// Potential inconsistencies between the list of members and the list of machines/nodes are
686+
// surfaced using the EtcdClusterHealthyCondition; if this condition is true, meaning no inconsistencies exists, return early.
687+
if conditions.IsTrue(controlPlane.KCP, controlplanev1.EtcdClusterHealthyCondition) {
688+
return nil
689+
}
690+
691+
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
692+
if err != nil {
693+
// Failing at connecting to the workload cluster can mean workload cluster is unhealthy for a variety of reasons such as etcd quorum loss.
694+
return errors.Wrap(err, "cannot get remote client to workload cluster")
695+
}
696+
697+
removedMembers, err := workloadCluster.ReconcileEtcdMembers(ctx, nodeNames)
698+
if err != nil {
699+
return errors.Wrap(err, "failed attempt to reconcile etcd members")
700+
}
701+
702+
if len(removedMembers) > 0 {
703+
log.Info("Etcd members without nodes removed from the cluster", "members", removedMembers)
704+
}
705+
706+
return nil
707+
}
708+
658709
func (r *KThreesControlPlaneReconciler) upgradeControlPlane(
659710
ctx context.Context,
660711
cluster *clusterv1.Cluster,
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
package controllers
2+
3+
import (
4+
"context"
5+
"time"
6+
7+
"github.com/go-logr/logr"
8+
"github.com/pkg/errors"
9+
apierrors "k8s.io/apimachinery/pkg/api/errors"
10+
"k8s.io/apimachinery/pkg/runtime"
11+
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
12+
"sigs.k8s.io/cluster-api/util"
13+
"sigs.k8s.io/cluster-api/util/annotations"
14+
"sigs.k8s.io/cluster-api/util/conditions"
15+
"sigs.k8s.io/cluster-api/util/patch"
16+
ctrl "sigs.k8s.io/controller-runtime"
17+
"sigs.k8s.io/controller-runtime/pkg/client"
18+
19+
k3s "github.com/k3s-io/cluster-api-k3s/pkg/k3s"
20+
)
21+
22+
// KThreesControlPlaneReconciler reconciles a KThreesControlPlane object.
23+
type MachineReconciler struct {
24+
client.Client
25+
Log logr.Logger
26+
Scheme *runtime.Scheme
27+
28+
EtcdDialTimeout time.Duration
29+
EtcdCallTimeout time.Duration
30+
31+
managementCluster k3s.ManagementCluster
32+
managementClusterUncached k3s.ManagementCluster
33+
}
34+
35+
func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, log *logr.Logger) error {
36+
_, err := ctrl.NewControllerManagedBy(mgr).
37+
For(&clusterv1.Machine{}).
38+
Build(r)
39+
40+
if r.managementCluster == nil {
41+
r.managementCluster = &k3s.Management{
42+
Client: r.Client,
43+
EtcdDialTimeout: r.EtcdDialTimeout,
44+
EtcdCallTimeout: r.EtcdCallTimeout,
45+
}
46+
}
47+
48+
if r.managementClusterUncached == nil {
49+
r.managementClusterUncached = &k3s.Management{
50+
Client: mgr.GetAPIReader(),
51+
EtcdDialTimeout: r.EtcdDialTimeout,
52+
EtcdCallTimeout: r.EtcdCallTimeout,
53+
}
54+
}
55+
56+
return err
57+
}
58+
59+
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters;clusters/status,verbs=get;list;watch
60+
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;create;update;patch;delete
61+
func (r *MachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
62+
logger := r.Log.WithValues("namespace", req.Namespace, "machine", req.Name)
63+
64+
m := &clusterv1.Machine{}
65+
if err := r.Client.Get(ctx, req.NamespacedName, m); err != nil {
66+
if apierrors.IsNotFound(err) {
67+
// Object not found, return. Created objects are automatically garbage collected.
68+
// For additional cleanup logic use finalizers.
69+
return ctrl.Result{}, nil
70+
}
71+
72+
// Error reading the object - requeue the request.
73+
return ctrl.Result{}, err
74+
}
75+
76+
if m.DeletionTimestamp.IsZero() {
77+
return ctrl.Result{}, nil
78+
}
79+
80+
// if machine registered PreTerminate hook, wait for capi to drain and deattach volume, then remove etcd member
81+
if annotations.HasWithPrefix(clusterv1.PreTerminateDeleteHookAnnotationPrefix, m.ObjectMeta.Annotations) &&
82+
m.ObjectMeta.Annotations[clusterv1.PreTerminateDeleteHookAnnotationPrefix] == k3sHookName {
83+
if !conditions.IsTrue(m, clusterv1.DrainingSucceededCondition) || !conditions.IsTrue(m, clusterv1.VolumeDetachSucceededCondition) {
84+
logger.Info("wait for machine drain and detech volume operation complete.")
85+
return ctrl.Result{}, nil
86+
}
87+
88+
cluster, err := util.GetClusterFromMetadata(ctx, r.Client, m.ObjectMeta)
89+
if err != nil {
90+
logger.Info("unable to get cluster.")
91+
return ctrl.Result{}, errors.Wrapf(err, "unable to get cluster")
92+
}
93+
94+
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
95+
if err != nil {
96+
logger.Error(err, "failed to create client to workload cluster")
97+
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
98+
}
99+
100+
etcdRemoved, err := workloadCluster.RemoveEtcdMemberForMachine(ctx, m)
101+
if err != nil {
102+
logger.Error(err, "failed to remove etcd member for machine")
103+
return ctrl.Result{}, err
104+
}
105+
if !etcdRemoved {
106+
logger.Info("wait k3s embedded etcd controller to remove etcd")
107+
return ctrl.Result{Requeue: true}, err
108+
}
109+
logger.Info("etcd remove etcd member succeeded", "node", m.Status.NodeRef.Name)
110+
111+
patchHelper, err := patch.NewHelper(m, r.Client)
112+
if err != nil {
113+
return ctrl.Result{}, errors.Wrapf(err, "failed to create patch helper for machine")
114+
}
115+
116+
mAnnotations := m.GetAnnotations()
117+
delete(mAnnotations, clusterv1.PreTerminateDeleteHookAnnotationPrefix)
118+
m.SetAnnotations(mAnnotations)
119+
if err := patchHelper.Patch(ctx, m); err != nil {
120+
return ctrl.Result{}, errors.Wrapf(err, "failed patch machine")
121+
}
122+
}
123+
124+
return ctrl.Result{}, nil
125+
}

controlplane/controllers/remediation.go

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
kerrors "k8s.io/apimachinery/pkg/util/errors"
2929
"k8s.io/klog/v2"
3030
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
31+
"sigs.k8s.io/cluster-api/util"
3132
"sigs.k8s.io/cluster-api/util/annotations"
3233
"sigs.k8s.io/cluster-api/util/conditions"
3334
"sigs.k8s.io/cluster-api/util/patch"
@@ -167,13 +168,10 @@ func (r *KThreesControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
167168
// Start remediating the unhealthy control plane machine by deleting it.
168169
// A new machine will come up completing the operation as part of the regular reconcile.
169170

170-
// TODO figure out etcd complexities
171171
// If the control plane is initialized, before deleting the machine:
172172
// - if the machine hosts the etcd leader, forward etcd leadership to another machine.
173173
// - delete the etcd member hosted on the machine being deleted.
174-
// - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0)
175-
/**
176-
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
174+
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
177175
if err != nil {
178176
log.Error(err, "Failed to create client to workload cluster")
179177
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
@@ -193,23 +191,20 @@ func (r *KThreesControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
193191
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
194192
return ctrl.Result{}, err
195193
}
196-
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
197-
log.Error(err, "Failed to remove etcd member for machine")
198-
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
199-
return ctrl.Result{}, err
194+
195+
patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
196+
if err != nil {
197+
return ctrl.Result{}, errors.Wrapf(err, "failed to create patch helper for machine")
200198
}
201-
}
202199

203-
parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
204-
if err != nil {
205-
return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version)
206-
}
200+
mAnnotations := machineToBeRemediated.GetAnnotations()
201+
mAnnotations[clusterv1.PreTerminateDeleteHookAnnotationPrefix] = k3sHookName
202+
machineToBeRemediated.SetAnnotations(mAnnotations)
207203

208-
if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil {
209-
log.Error(err, "Failed to remove machine from kubeadm ConfigMap")
210-
return ctrl.Result{}, err
204+
if err := patchHelper.Patch(ctx, machineToBeRemediated); err != nil {
205+
return ctrl.Result{}, errors.Wrapf(err, "failed patch machine for adding preTerminate hook")
206+
}
211207
}
212-
**/
213208
}
214209

215210
// Delete the machine

controlplane/controllers/scale.go

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ package controllers
1919
import (
2020
"context"
2121
"encoding/json"
22-
"errors"
2322
"fmt"
2423
"strings"
2524

25+
"github.com/pkg/errors"
2626
corev1 "k8s.io/api/core/v1"
2727
apierrors "k8s.io/apimachinery/pkg/api/errors"
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -33,6 +33,7 @@ import (
3333
"sigs.k8s.io/cluster-api/controllers/external"
3434
"sigs.k8s.io/cluster-api/util"
3535
"sigs.k8s.io/cluster-api/util/conditions"
36+
"sigs.k8s.io/cluster-api/util/patch"
3637
ctrl "sigs.k8s.io/controller-runtime"
3738

3839
bootstrapv1 "github.com/k3s-io/cluster-api-k3s/bootstrap/api/v1beta1"
@@ -132,12 +133,19 @@ func (r *KThreesControlPlaneReconciler) scaleDownControlPlane(
132133
logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
133134
return ctrl.Result{}, err
134135
}
135-
logger.Info("etcd move etcd leader succeeded, node to delete %s", machineToDelete.Status.NodeRef.Name)
136-
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToDelete); err != nil {
137-
logger.Error(err, "Failed to remove etcd member for machine")
138-
return ctrl.Result{}, err
136+
137+
patchHelper, err := patch.NewHelper(machineToDelete, r.Client)
138+
if err != nil {
139+
return ctrl.Result{}, errors.Wrapf(err, "failed to create patch helper for machine")
140+
}
141+
142+
mAnnotations := machineToDelete.GetAnnotations()
143+
mAnnotations[clusterv1.PreTerminateDeleteHookAnnotationPrefix] = k3sHookName
144+
machineToDelete.SetAnnotations(mAnnotations)
145+
146+
if err := patchHelper.Patch(ctx, machineToDelete); err != nil {
147+
return ctrl.Result{}, errors.Wrapf(err, "failed patch machine for adding preTerminate hook")
139148
}
140-
logger.Info("etcd remove etcd member succeeded, node to delete %s", machineToDelete.Status.NodeRef.Name)
141149
}
142150

143151
logger = logger.WithValues("machine", machineToDelete)
@@ -177,6 +185,12 @@ func (r *KThreesControlPlaneReconciler) preflightChecks(_ context.Context, contr
177185

178186
// Check machine health conditions; if there are conditions with False or Unknown, then wait.
179187
allMachineHealthConditions := []clusterv1.ConditionType{controlplanev1.MachineAgentHealthyCondition}
188+
if controlPlane.IsEtcdManaged() {
189+
allMachineHealthConditions = append(allMachineHealthConditions,
190+
controlplanev1.MachineEtcdMemberHealthyCondition,
191+
)
192+
}
193+
180194
machineErrors := []error{}
181195

182196
loopmachines:

controlplane/main.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,18 @@ func main() {
110110
setupLog.Error(err, "unable to create controller", "controller", "KThreesControlPlane")
111111
os.Exit(1)
112112
}
113+
114+
ctrMachineLogger := ctrl.Log.WithName("controllers").WithName("Machine")
115+
if err = (&controllers.MachineReconciler{
116+
Client: mgr.GetClient(),
117+
Log: ctrMachineLogger,
118+
Scheme: mgr.GetScheme(),
119+
EtcdDialTimeout: etcdDialTimeout,
120+
EtcdCallTimeout: etcdCallTimeout,
121+
}).SetupWithManager(ctx, mgr, &ctrMachineLogger); err != nil {
122+
setupLog.Error(err, "unable to create controller", "controller", "Machine")
123+
os.Exit(1)
124+
}
113125
// +kubebuilder:scaffold:builder
114126

115127
setupLog.Info("starting manager")

0 commit comments

Comments
 (0)