Skip to content

Commit ac3d9e3

Browse files
Call Dqlite remove endpoint for cleanup (KU-1719) (#69)
* Call node removal endpoint on machine deletion (#66) * Add pkg to Dockerfile and Fix TLS issue (#67) * Add capi-auth-token header to /dqlite/remove request (#68)
1 parent 4f3a678 commit ac3d9e3

File tree

11 files changed

+594
-1
lines changed

11 files changed

+594
-1
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ RUN go mod download
1515
COPY main.go main.go
1616
COPY api/ api/
1717
COPY controllers/ controllers/
18+
COPY pkg/ pkg/
1819

1920
# Build
2021
RUN CGO_ENABLED=0 GOOS=linux GOARCH=$arch go build -a -ldflags '-s -w' -o manager main.go

controllers/reconcile.go

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,22 @@ import (
44
"context"
55
"fmt"
66
"math/rand"
7+
"net"
78
"sort"
89
"strings"
910
"time"
1011

1112
clusterv1beta1 "github.com/canonical/cluster-api-control-plane-provider-microk8s/api/v1beta1"
13+
"github.com/canonical/cluster-api-control-plane-provider-microk8s/pkg/clusteragent"
14+
"github.com/canonical/cluster-api-control-plane-provider-microk8s/pkg/token"
1215
"golang.org/x/mod/semver"
1316

1417
"github.com/pkg/errors"
1518
corev1 "k8s.io/api/core/v1"
1619
apierrors "k8s.io/apimachinery/pkg/api/errors"
1720
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1821
kerrors "k8s.io/apimachinery/pkg/util/errors"
22+
"k8s.io/apimachinery/pkg/util/sets"
1923
"k8s.io/apiserver/pkg/storage/names"
2024
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2125
"sigs.k8s.io/cluster-api/controllers/external"
@@ -28,6 +32,12 @@ import (
2832
"sigs.k8s.io/controller-runtime/pkg/log"
2933
)
3034

35+
const (
36+
defaultClusterAgentPort string = "25000"
37+
defaultDqlitePort string = "19001"
38+
defaultClusterAgentClientTimeout time.Duration = 10 * time.Second
39+
)
40+
3141
type errServiceUnhealthy struct {
3242
service string
3343
reason string
@@ -466,7 +476,7 @@ func (r *MicroK8sControlPlaneReconciler) reconcileDelete(ctx context.Context, cl
466476
}
467477

468478
// clean up MicroK8s cluster secrets
469-
for _, secretName := range []string{"kubeconfig", "ca", "jointoken"} {
479+
for _, secretName := range []string{"kubeconfig", "ca", "jointoken", token.AuthTokenNameSuffix} {
470480
secret := &corev1.Secret{
471481
ObjectMeta: metav1.ObjectMeta{
472482
Namespace: cluster.Namespace,
@@ -578,6 +588,28 @@ func (r *MicroK8sControlPlaneReconciler) scaleDownControlPlane(ctx context.Conte
578588
node := deleteMachine.Status.NodeRef
579589

580590
logger = logger.WithValues("machineName", deleteMachine.Name, "nodeName", node.Name)
591+
592+
logger.Info("deleting node from dqlite", "machineName", deleteMachine.Name, "nodeName", node.Name)
593+
594+
// NOTE(Hue): We do this step as a best effort since this whole logic is implemented to prevent a not-yet-reported bug.
595+
// The issue is that we were not removing the endpoint from dqlite when we were deleting a machine.
596+
// This would cause a situation were a joining node failed to join because the endpoint was already in the dqlite cluster.
597+
// How? The IP assigned to the joining (new) node, previously belonged to a node that was deleted, but the IP is still there in dqlite.
598+
// If we have 2 machines, deleting one is not safe because it can be the leader and we're not taking care of
599+
// leadership transfers in the cluster-agent for now. Maybe something for later (TODO)
600+
// If we have 3 or more machines left, get cluster agent client and delete node from dqlite.
601+
if len(machines) > 2 {
602+
portRemap := tcp != nil && tcp.Spec.ControlPlaneConfig.ClusterConfiguration != nil && tcp.Spec.ControlPlaneConfig.ClusterConfiguration.PortCompatibilityRemap
603+
604+
if clusterAgentClient, err := getClusterAgentClient(machines, deleteMachine, portRemap); err == nil {
605+
if err := r.removeNodeFromDqlite(ctx, clusterAgentClient, cluster, deleteMachine, portRemap); err != nil {
606+
logger.Error(err, "failed to remove node from dqlite: %w", "machineName", deleteMachine.Name, "nodeName", node.Name)
607+
}
608+
} else {
609+
logger.Error(err, "failed to get cluster agent client")
610+
}
611+
}
612+
581613
logger.Info("deleting machine")
582614

583615
err = r.Client.Delete(ctx, &deleteMachine)
@@ -595,6 +627,60 @@ func (r *MicroK8sControlPlaneReconciler) scaleDownControlPlane(ctx context.Conte
595627
return ctrl.Result{Requeue: true}, nil
596628
}
597629

630+
func getClusterAgentClient(machines []clusterv1.Machine, delMachine clusterv1.Machine, portRemap bool) (*clusteragent.Client, error) {
631+
opts := clusteragent.Options{
632+
// NOTE(hue): We want to pick a random machine's IP to call POST /dqlite/remove on its cluster agent endpoint.
633+
// This machine should preferably not be the <delMachine> itself, although this is not forced by Microk8s.
634+
IgnoreMachineNames: sets.NewString(delMachine.Name),
635+
}
636+
637+
port := defaultClusterAgentPort
638+
if portRemap {
639+
// https://github.com/canonical/cluster-api-control-plane-provider-microk8s/blob/v0.6.10/control-plane-components.yaml#L96-L102
640+
port = "30000"
641+
}
642+
643+
clusterAgentClient, err := clusteragent.NewClient(machines, port, defaultClusterAgentClientTimeout, opts)
644+
if err != nil {
645+
return nil, fmt.Errorf("failed to initialize cluster agent client: %w", err)
646+
}
647+
648+
return clusterAgentClient, nil
649+
}
650+
651+
// removeMicrok8sNode removes the node from
652+
func (r *MicroK8sControlPlaneReconciler) removeNodeFromDqlite(ctx context.Context, clusterAgentClient *clusteragent.Client,
653+
clusterKey client.ObjectKey, delMachine clusterv1.Machine, portRemap bool) error {
654+
dqlitePort := defaultDqlitePort
655+
if portRemap {
656+
// https://github.com/canonical/cluster-api-control-plane-provider-microk8s/blob/v0.6.10/control-plane-components.yaml#L96-L102
657+
dqlitePort = "2379"
658+
}
659+
660+
var removeEp string
661+
for _, addr := range delMachine.Status.Addresses {
662+
if net.ParseIP(addr.Address) != nil {
663+
removeEp = fmt.Sprintf("%s:%s", addr.Address, dqlitePort)
664+
break
665+
}
666+
}
667+
668+
if removeEp == "" {
669+
return fmt.Errorf("failed to extract endpoint of the deleting machine %q", delMachine.Name)
670+
}
671+
672+
token, err := token.Lookup(ctx, r.Client, clusterKey)
673+
if err != nil {
674+
return fmt.Errorf("failed to lookup token: %w", err)
675+
}
676+
677+
if err := clusterAgentClient.RemoveNodeFromDqlite(ctx, token, removeEp); err != nil {
678+
return fmt.Errorf("failed to remove node %q from dqlite: %w", removeEp, err)
679+
}
680+
681+
return nil
682+
}
683+
598684
func createUpgradePod(ctx context.Context, kubeclient *kubernetesClient, nodeName string, nodeVersion string) (*corev1.Pod, error) {
599685
nodeVersion = strings.TrimPrefix(semver.MajorMinor(nodeVersion), "v")
600686

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ require (
1717
cloud.google.com/go/compute v1.10.0 // indirect
1818
github.com/blang/semver v3.5.1+incompatible // indirect
1919
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
20+
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
2021
github.com/evanphx/json-patch/v5 v5.6.0 // indirect
2122
github.com/go-openapi/jsonpointer v0.19.5 // indirect
2223
github.com/go-openapi/jsonreference v0.20.0 // indirect

pkg/clusteragent/clusteragent.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package clusteragent
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"crypto/tls"
7+
"encoding/json"
8+
"errors"
9+
"fmt"
10+
"net"
11+
"net/http"
12+
"time"
13+
14+
"k8s.io/apimachinery/pkg/util/sets"
15+
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
16+
)
17+
18+
// Options should be used when initializing a new client.
19+
type Options struct {
20+
// IgnoreMachineNames is a set of ignored machine names that we don't want to pick their IP for the cluster agent endpoint.
21+
IgnoreMachineNames sets.String
22+
}
23+
24+
type Client struct {
25+
ip, port string
26+
client *http.Client
27+
}
28+
29+
// NewClient picks an IP from one of the given machines and creates a new client for the cluster agent
30+
// with that IP.
31+
func NewClient(machines []clusterv1.Machine, port string, timeout time.Duration, opts Options) (*Client, error) {
32+
var ip string
33+
for _, m := range machines {
34+
if opts.IgnoreMachineNames.Has(m.Name) {
35+
continue
36+
}
37+
38+
for _, addr := range m.Status.Addresses {
39+
if net.ParseIP(addr.Address) != nil {
40+
ip = addr.Address
41+
break
42+
}
43+
}
44+
break
45+
}
46+
47+
if ip == "" {
48+
return nil, errors.New("failed to find an IP for cluster agent")
49+
}
50+
51+
return &Client{
52+
ip: ip,
53+
port: port,
54+
client: &http.Client{
55+
Timeout: timeout,
56+
Transport: &http.Transport{
57+
TLSClientConfig: &tls.Config{
58+
// TODO(Hue): Workaround for now, address later on
59+
// get the certificate fingerprint from the matching node through a resource in the cluster (TBD),
60+
// and validate it in the TLSClientConfig
61+
InsecureSkipVerify: true,
62+
},
63+
},
64+
},
65+
}, nil
66+
}
67+
68+
func (c *Client) Endpoint() string {
69+
return fmt.Sprintf("https://%s:%s", c.ip, c.port)
70+
}
71+
72+
// do makes a request to the given endpoint with the given method. It marshals the request and unmarshals
73+
// server response body if the provided response is not nil.
74+
// The endpoint should _not_ have a leading slash.
75+
func (c *Client) do(ctx context.Context, method, endpoint string, request any, header map[string][]string, response any) error {
76+
url := fmt.Sprintf("https://%s:%s/%s", c.ip, c.port, endpoint)
77+
78+
requestBody, err := json.Marshal(request)
79+
if err != nil {
80+
return fmt.Errorf("failed to prepare worker info request: %w", err)
81+
}
82+
83+
req, err := http.NewRequestWithContext(ctx, method, url, bytes.NewBuffer(requestBody))
84+
if err != nil {
85+
return fmt.Errorf("failed to create request: %w", err)
86+
}
87+
88+
req.Header = http.Header(header)
89+
90+
res, err := c.client.Do(req)
91+
if err != nil {
92+
return fmt.Errorf("failed to call cluster agent: %w", err)
93+
}
94+
defer res.Body.Close()
95+
96+
if res.StatusCode != http.StatusOK {
97+
// NOTE(hue): Marshal and print any response that we got since it might contain valuable information
98+
// on why the request failed.
99+
// Ignore JSON errors to prevent unnecessarily complicated error handling.
100+
anyResp := make(map[string]any)
101+
_ = json.NewDecoder(res.Body).Decode(&anyResp)
102+
b, _ := json.Marshal(anyResp)
103+
resStr := string(b)
104+
105+
return fmt.Errorf("HTTP request to cluster agent failed with status code %d, got response: %q", res.StatusCode, resStr)
106+
}
107+
108+
if response != nil {
109+
if err := json.NewDecoder(res.Body).Decode(response); err != nil {
110+
return fmt.Errorf("failed to decode response: %w", err)
111+
}
112+
}
113+
114+
return nil
115+
}

0 commit comments

Comments
 (0)