@@ -4,18 +4,22 @@ import (
44 "context"
55 "fmt"
66 "math/rand"
7+ "net"
78 "sort"
89 "strings"
910 "time"
1011
1112 clusterv1beta1 "github.com/canonical/cluster-api-control-plane-provider-microk8s/api/v1beta1"
13+ "github.com/canonical/cluster-api-control-plane-provider-microk8s/pkg/clusteragent"
14+ "github.com/canonical/cluster-api-control-plane-provider-microk8s/pkg/token"
1215 "golang.org/x/mod/semver"
1316
1417 "github.com/pkg/errors"
1518 corev1 "k8s.io/api/core/v1"
1619 apierrors "k8s.io/apimachinery/pkg/api/errors"
1720 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1821 kerrors "k8s.io/apimachinery/pkg/util/errors"
22+ "k8s.io/apimachinery/pkg/util/sets"
1923 "k8s.io/apiserver/pkg/storage/names"
2024 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2125 "sigs.k8s.io/cluster-api/controllers/external"
@@ -28,6 +32,12 @@ import (
2832 "sigs.k8s.io/controller-runtime/pkg/log"
2933)
3034
// Defaults used when contacting a node's cluster agent and when building
// dqlite endpoints during control plane scale down.
const (
	// defaultClusterAgentPort is the cluster agent port used when port
	// compatibility remapping is not enabled.
	defaultClusterAgentPort string = "25000"
	// defaultDqlitePort is the dqlite port used when port compatibility
	// remapping is not enabled.
	defaultDqlitePort string = "19001"
	// defaultClusterAgentClientTimeout is the timeout handed to the cluster
	// agent client on creation.
	defaultClusterAgentClientTimeout time.Duration = 10 * time.Second
)
40+
3141type errServiceUnhealthy struct {
3242 service string
3343 reason string
@@ -466,7 +476,7 @@ func (r *MicroK8sControlPlaneReconciler) reconcileDelete(ctx context.Context, cl
466476 }
467477
468478 // clean up MicroK8s cluster secrets
469- for _ , secretName := range []string {"kubeconfig" , "ca" , "jointoken" } {
479+ for _ , secretName := range []string {"kubeconfig" , "ca" , "jointoken" , token . AuthTokenNameSuffix } {
470480 secret := & corev1.Secret {
471481 ObjectMeta : metav1.ObjectMeta {
472482 Namespace : cluster .Namespace ,
@@ -578,6 +588,28 @@ func (r *MicroK8sControlPlaneReconciler) scaleDownControlPlane(ctx context.Conte
578588 node := deleteMachine .Status .NodeRef
579589
580590 logger = logger .WithValues ("machineName" , deleteMachine .Name , "nodeName" , node .Name )
591+
592+ logger .Info ("deleting node from dqlite" , "machineName" , deleteMachine .Name , "nodeName" , node .Name )
593+
594+ // NOTE(Hue): We do this step as a best effort since this whole logic is implemented to prevent a not-yet-reported bug.
595+ // The issue is that we were not removing the endpoint from dqlite when we were deleting a machine.
596+ // This would cause a situation where a joining node failed to join because the endpoint was already in the dqlite cluster.
597+ // How? The IP assigned to the joining (new) node, previously belonged to a node that was deleted, but the IP is still there in dqlite.
598+ // If we have 2 machines, deleting one is not safe because it can be the leader and we're not taking care of
599+ // leadership transfers in the cluster-agent for now. Maybe something for later (TODO)
600+ // If we have 3 or more machines left, get cluster agent client and delete node from dqlite.
601+ if len (machines ) > 2 {
602+ portRemap := tcp != nil && tcp .Spec .ControlPlaneConfig .ClusterConfiguration != nil && tcp .Spec .ControlPlaneConfig .ClusterConfiguration .PortCompatibilityRemap
603+
604+ if clusterAgentClient , err := getClusterAgentClient (machines , deleteMachine , portRemap ); err == nil {
605+ if err := r .removeNodeFromDqlite (ctx , clusterAgentClient , cluster , deleteMachine , portRemap ); err != nil {
606+ logger .Error (err , "failed to remove node from dqlite: %w" , "machineName" , deleteMachine .Name , "nodeName" , node .Name )
607+ }
608+ } else {
609+ logger .Error (err , "failed to get cluster agent client" )
610+ }
611+ }
612+
581613 logger .Info ("deleting machine" )
582614
583615 err = r .Client .Delete (ctx , & deleteMachine )
@@ -595,6 +627,60 @@ func (r *MicroK8sControlPlaneReconciler) scaleDownControlPlane(ctx context.Conte
595627 return ctrl.Result {Requeue : true }, nil
596628}
597629
630+ func getClusterAgentClient (machines []clusterv1.Machine , delMachine clusterv1.Machine , portRemap bool ) (* clusteragent.Client , error ) {
631+ opts := clusteragent.Options {
632+ // NOTE(hue): We want to pick a random machine's IP to call POST /dqlite/remove on its cluster agent endpoint.
633+ // This machine should preferably not be the <delMachine> itself, although this is not forced by Microk8s.
634+ IgnoreMachineNames : sets .NewString (delMachine .Name ),
635+ }
636+
637+ port := defaultClusterAgentPort
638+ if portRemap {
639+ // https://github.com/canonical/cluster-api-control-plane-provider-microk8s/blob/v0.6.10/control-plane-components.yaml#L96-L102
640+ port = "30000"
641+ }
642+
643+ clusterAgentClient , err := clusteragent .NewClient (machines , port , defaultClusterAgentClientTimeout , opts )
644+ if err != nil {
645+ return nil , fmt .Errorf ("failed to initialize cluster agent client: %w" , err )
646+ }
647+
648+ return clusterAgentClient , nil
649+ }
650+
651+ // removeMicrok8sNode removes the node from
652+ func (r * MicroK8sControlPlaneReconciler ) removeNodeFromDqlite (ctx context.Context , clusterAgentClient * clusteragent.Client ,
653+ clusterKey client.ObjectKey , delMachine clusterv1.Machine , portRemap bool ) error {
654+ dqlitePort := defaultDqlitePort
655+ if portRemap {
656+ // https://github.com/canonical/cluster-api-control-plane-provider-microk8s/blob/v0.6.10/control-plane-components.yaml#L96-L102
657+ dqlitePort = "2379"
658+ }
659+
660+ var removeEp string
661+ for _ , addr := range delMachine .Status .Addresses {
662+ if net .ParseIP (addr .Address ) != nil {
663+ removeEp = fmt .Sprintf ("%s:%s" , addr .Address , dqlitePort )
664+ break
665+ }
666+ }
667+
668+ if removeEp == "" {
669+ return fmt .Errorf ("failed to extract endpoint of the deleting machine %q" , delMachine .Name )
670+ }
671+
672+ token , err := token .Lookup (ctx , r .Client , clusterKey )
673+ if err != nil {
674+ return fmt .Errorf ("failed to lookup token: %w" , err )
675+ }
676+
677+ if err := clusterAgentClient .RemoveNodeFromDqlite (ctx , token , removeEp ); err != nil {
678+ return fmt .Errorf ("failed to remove node %q from dqlite: %w" , removeEp , err )
679+ }
680+
681+ return nil
682+ }
683+
598684func createUpgradePod (ctx context.Context , kubeclient * kubernetesClient , nodeName string , nodeVersion string ) (* corev1.Pod , error ) {
599685 nodeVersion = strings .TrimPrefix (semver .MajorMinor (nodeVersion ), "v" )
600686
0 commit comments