@@ -4,18 +4,21 @@ import (
44 "context"
55 "fmt"
66 "math/rand"
7+ "net"
78 "sort"
89 "strings"
910 "time"
1011
1112 clusterv1beta1 "github.com/canonical/cluster-api-control-plane-provider-microk8s/api/v1beta1"
13+ "github.com/canonical/cluster-api-control-plane-provider-microk8s/pkg/clusteragent"
1214 "golang.org/x/mod/semver"
1315
1416 "github.com/pkg/errors"
1517 corev1 "k8s.io/api/core/v1"
1618 apierrors "k8s.io/apimachinery/pkg/api/errors"
1719 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1820 kerrors "k8s.io/apimachinery/pkg/util/errors"
21+ "k8s.io/apimachinery/pkg/util/sets"
1922 "k8s.io/apiserver/pkg/storage/names"
2023 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2124 "sigs.k8s.io/cluster-api/controllers/external"
@@ -28,6 +31,12 @@ import (
2831 "sigs.k8s.io/controller-runtime/pkg/log"
2932)
3033
// Defaults used when contacting the MicroK8s cluster agent to remove a node
// from dqlite during control plane scale down.
const (
	// defaultClusterAgentPort is the cluster agent port used when port
	// remapping is not enabled.
	defaultClusterAgentPort string = "25000"

	// defaultDqlitePort is the dqlite port used to build the endpoint of the
	// node being removed when port remapping is not enabled.
	defaultDqlitePort string = "19001"

	// defaultClusterAgentClientTimeout is the timeout handed to the cluster
	// agent client on creation.
	defaultClusterAgentClientTimeout = 10 * time.Second
)
39+
3140type errServiceUnhealthy struct {
3241 service string
3342 reason string
@@ -578,6 +587,26 @@ func (r *MicroK8sControlPlaneReconciler) scaleDownControlPlane(ctx context.Conte
578587 node := deleteMachine .Status .NodeRef
579588
580589 logger = logger .WithValues ("machineName" , deleteMachine .Name , "nodeName" , node .Name )
590+
591+ logger .Info ("deleting node from dqlite" , "machineName" , deleteMachine .Name , "nodeName" , node .Name )
592+
593+ // NOTE(Hue): We do this step as a best effort since this whole logic is implemented to prevent a not-yet-reported bug.
594+ // The issue is that we were not removing the endpoint from dqlite when we were deleting a machine.
595+ // This would cause a situation where a joining node failed to join because the endpoint was already in the dqlite cluster.
596+ // How? The IP assigned to the joining (new) node, previously belonged to a node that was deleted, but the IP is still there in dqlite.
597+ // If we have 2 or more machines left, get cluster agent client and delete node from dqlite
598+ if len (machines ) > 1 {
599+ portRemap := tcp != nil && tcp .Spec .ControlPlaneConfig .ClusterConfiguration != nil && tcp .Spec .ControlPlaneConfig .ClusterConfiguration .PortCompatibilityRemap
600+
601+ if clusterAgentClient , err := getClusterAgentClient (machines , deleteMachine , portRemap ); err == nil {
602+ if err := r .removeNodeFromDqlite (ctx , clusterAgentClient , deleteMachine , portRemap ); err != nil {
603+ logger .Error (err , "failed to remove node from dqlite: %w" , "machineName" , deleteMachine .Name , "nodeName" , node .Name )
604+ }
605+ } else {
606+ logger .Error (err , "failed to get cluster agent client" )
607+ }
608+ }
609+
581610 logger .Info ("deleting machine" )
582611
583612 err = r .Client .Delete (ctx , & deleteMachine )
@@ -595,6 +624,54 @@ func (r *MicroK8sControlPlaneReconciler) scaleDownControlPlane(ctx context.Conte
595624 return ctrl.Result {Requeue : true }, nil
596625}
597626
627+ func getClusterAgentClient (machines []clusterv1.Machine , delMachine clusterv1.Machine , portRemap bool ) (* clusteragent.Client , error ) {
628+ opts := clusteragent.Options {
629+ // NOTE(hue): We want to pick a random machine's IP to call POST /dqlite/remove on its cluster agent endpoint.
630+ // This machine should preferably not be the <delMachine> itself, although this is not forced by Microk8s.
631+ IgnoreMachineNames : sets .NewString (delMachine .Name ),
632+ }
633+
634+ port := defaultClusterAgentPort
635+ if portRemap {
636+ // https://github.com/canonical/cluster-api-control-plane-provider-microk8s/blob/v0.6.10/control-plane-components.yaml#L96-L102
637+ port = "30000"
638+ }
639+
640+ clusterAgentClient , err := clusteragent .NewClient (machines , port , defaultClusterAgentClientTimeout , opts )
641+ if err != nil {
642+ return nil , fmt .Errorf ("failed to initialize cluster agent client: %w" , err )
643+ }
644+
645+ return clusterAgentClient , nil
646+ }
647+
648+ // removeMicrok8sNode removes the node from
649+ func (r * MicroK8sControlPlaneReconciler ) removeNodeFromDqlite (ctx context.Context , clusterAgentClient * clusteragent.Client , delMachine clusterv1.Machine , portRemap bool ) error {
650+ dqlitePort := defaultDqlitePort
651+ if portRemap {
652+ // https://github.com/canonical/cluster-api-control-plane-provider-microk8s/blob/v0.6.10/control-plane-components.yaml#L96-L102
653+ dqlitePort = "2379"
654+ }
655+
656+ var removeEp string
657+ for _ , addr := range delMachine .Status .Addresses {
658+ if net .ParseIP (addr .Address ) != nil {
659+ removeEp = fmt .Sprintf ("%s:%s" , addr .Address , dqlitePort )
660+ break
661+ }
662+ }
663+
664+ if removeEp == "" {
665+ return fmt .Errorf ("failed to extract endpoint of the deleting machine %q" , delMachine .Name )
666+ }
667+
668+ if err := clusterAgentClient .RemoveNodeFromDqlite (ctx , removeEp ); err != nil {
669+ return fmt .Errorf ("failed to remove node %q from dqlite: %w" , removeEp , err )
670+ }
671+
672+ return nil
673+ }
674+
598675func createUpgradePod (ctx context.Context , kubeclient * kubernetesClient , nodeName string , nodeVersion string ) (* corev1.Pod , error ) {
599676 nodeVersion = strings .TrimPrefix (semver .MajorMinor (nodeVersion ), "v" )
600677
0 commit comments