Skip to content

Commit bfe38e3

Browse files
feat: reset command (#276)
* reset command * fmt * delete controllnode * always drain node * swap defer order * Update cmd/embedded-cluster/reset.go Co-authored-by: Andrew Lavery <[email protected]> * refactor * format * actually work * add prompt to continue anyway * also reset single nodes * always warn to reboot * prompt to reset * basic quorum safety warnings * mod * Update cmd/embedded-cluster/reset.go Co-authored-by: Andrew Lavery <[email protected]> --------- Co-authored-by: Andrew Lavery <[email protected]>
1 parent 1c46797 commit bfe38e3

File tree

3 files changed

+301
-1
lines changed

3 files changed

+301
-1
lines changed

cmd/embedded-cluster/node.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ var nodeCommands = &cli.Command{
1818
nodeStartCommand,
1919
nodeListCommand,
2020
joinCommand,
21+
resetCommand,
2122
},
2223
}
2324

cmd/embedded-cluster/reset.go

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"os"
8+
"os/exec"
9+
10+
autopilot "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2"
11+
"github.com/urfave/cli/v2"
12+
corev1 "k8s.io/api/core/v1"
13+
"sigs.k8s.io/controller-runtime/pkg/client"
14+
15+
"k8s.io/client-go/tools/clientcmd"
16+
17+
"github.com/replicatedhq/embedded-cluster/pkg/defaults"
18+
"github.com/replicatedhq/embedded-cluster/pkg/prompts"
19+
)
20+
21+
// etcdMembers is the structure that the JSON output of `k0s etcd member-list`
// is decoded into.
type etcdMembers struct {
22+
// Members is read from the "members" JSON key; only its length is used in this file.
// NOTE(review): presumably maps member name to peer URL — confirm against k0s output.
Members map[string]string `json:"members"`
23+
}
24+
25+
// hostInfo collects what the reset command knows about the node it runs on.
type hostInfo struct {
26+
// Hostname is the value reported by os.Hostname; it doubles as the name used
// to look up the Node and ControlNode objects.
Hostname string
27+
// Kclient is the Kubernetes API client built from the k0s admin kubeconfig.
Kclient client.Client
28+
// Node is this host's Node object as fetched from the API server.
Node corev1.Node
29+
// ControlNode is this host's autopilot ControlNode object; it is only fetched
// when the node carries the control-plane role label.
ControlNode autopilot.ControlNode
30+
}
31+
32+
var (
33+
// binName is the value of defaults.BinaryName(); it is printed in progress messages.
binName = defaults.BinaryName()
34+
// k0s is the value of defaults.K0sBinaryPath(); every external command in this file is run through it.
k0s = defaults.K0sBinaryPath()
35+
)
36+
37+
// drainNode uses k0s to initiate a node drain
38+
func (h *hostInfo) drainNode() error {
39+
os.Unsetenv("KUBECONFIG")
40+
drainArgList := []string{
41+
"kubectl",
42+
"drain",
43+
"--ignore-daemonsets",
44+
"--delete-emptydir-data",
45+
h.Hostname,
46+
}
47+
out, err := exec.Command(k0s, drainArgList...).CombinedOutput()
48+
if err != nil {
49+
return fmt.Errorf("could not drain node: %w, %s", err, out)
50+
}
51+
return nil
52+
}
53+
54+
// configureKubernetesClient sets up a client to use for kubernetes api calls
55+
func (h *hostInfo) configureKubernetesClient() error {
56+
adminConfig, err := exec.Command(k0s, "kubeconfig", "admin").Output()
57+
if err != nil {
58+
return err
59+
}
60+
restConfig, err := clientcmd.RESTConfigFromKubeConfig(adminConfig)
61+
if err != nil {
62+
return err
63+
}
64+
h.Kclient, err = client.New(restConfig, client.Options{})
65+
autopilot.AddToScheme(h.Kclient.Scheme())
66+
if err != nil {
67+
return fmt.Errorf("couldn't create k8s config: %w", err)
68+
}
69+
return nil
70+
}
71+
72+
// getHostName fetches the hostname for the node
73+
func (h *hostInfo) getHostName() error {
74+
hostname, err := os.Hostname()
75+
if err != nil {
76+
return nil
77+
}
78+
h.Hostname = hostname
79+
return nil
80+
}
81+
82+
// isControlPlane attempts to determine if the node is a controlplane node
83+
func (h *hostInfo) isControlPlane() bool {
84+
labels := h.Node.GetLabels()
85+
return labels["node-role.kubernetes.io/control-plane"] == "true"
86+
}
87+
88+
// getNodeObject fetches the node object from the k8s api server
89+
func (h *hostInfo) getNodeObject(ctx context.Context) error {
90+
err := h.Kclient.Get(ctx, client.ObjectKey{Name: h.Hostname}, &h.Node)
91+
if err != nil {
92+
return err
93+
}
94+
return nil
95+
}
96+
97+
// getControlNodeObject fetches the controlNode object from the k8s api server
98+
func (h *hostInfo) getControlNodeObject(ctx context.Context) error {
99+
err := h.Kclient.Get(ctx, client.ObjectKey{Name: h.Hostname}, &h.ControlNode)
100+
if err != nil {
101+
return err
102+
}
103+
return nil
104+
}
105+
106+
func (h *hostInfo) checkQuorumSafety(c *cli.Context) (bool, string, error) {
107+
if c.Bool("yes-really-reset") {
108+
return true, "", nil
109+
}
110+
out, err := exec.Command(k0s, "etcd", "member-list").Output()
111+
if err != nil {
112+
return false, "", fmt.Errorf("unable to fetch etcd member list, %w, %s", err, out)
113+
}
114+
etcd := etcdMembers{}
115+
err = json.Unmarshal(out, &etcd)
116+
if err != nil {
117+
return false, "", fmt.Errorf("unable to unmarshal etcd member list, %w, %s", err, out)
118+
}
119+
if len(etcd.Members) < 3 {
120+
return true, "", nil
121+
}
122+
if len(etcd.Members) == 3 {
123+
return false, "cluster has 3 control-plane nodes, removing this node will cause etcd to lose quorum", nil
124+
}
125+
if len(etcd.Members)%2 != 0 {
126+
return false, "cluster would have even number of control-plane nodes after resetting this node, this could cause etcd to become unstable", nil
127+
}
128+
return true, "", nil
129+
}
130+
131+
// leaveEtcdcluster uses k0s to attempt to leave the etcd cluster
132+
func (h *hostInfo) leaveEtcdcluster() error {
133+
out, err := exec.Command(k0s, "etcd", "leave").CombinedOutput()
134+
if err != nil {
135+
return fmt.Errorf("unable to leave etcd cluster: %w, %s", err, string(out))
136+
}
137+
return nil
138+
}
139+
140+
// stopK0s attempts to stop the k0s service
141+
func (h *hostInfo) stopAndResetK0s() error {
142+
out, err := exec.Command(k0s, "stop").CombinedOutput()
143+
if err != nil {
144+
return fmt.Errorf("could not stop k0s service: %w, %s", err, string(out))
145+
}
146+
out, err = exec.Command(k0s, "reset").CombinedOutput()
147+
if err != nil {
148+
return fmt.Errorf("could not reset k0s: %w, %s", err, string(out))
149+
}
150+
fmt.Println("Node has been reset, please reboot to ensure transient configuration is also reset")
151+
return nil
152+
}
153+
154+
// newHostInfo returns a populated hostInfo struct
155+
func newHostInfo(ctx context.Context) (hostInfo, error) {
156+
currentHost := hostInfo{}
157+
// populate hostname
158+
err := currentHost.getHostName()
159+
if err != nil {
160+
return currentHost, err
161+
}
162+
// set up kube client
163+
err = currentHost.configureKubernetesClient()
164+
if err != nil {
165+
return currentHost, err
166+
}
167+
// fetch node object
168+
err = currentHost.getNodeObject(ctx)
169+
if err != nil {
170+
return currentHost, err
171+
}
172+
// control plane only stff
173+
if currentHost.isControlPlane() {
174+
// fetch controlNode
175+
err := currentHost.getControlNodeObject(ctx)
176+
if err != nil {
177+
return currentHost, err
178+
}
179+
}
180+
return currentHost, nil
181+
}
182+
183+
func checkErrPrompt(err error) bool {
184+
if err == nil {
185+
return true
186+
}
187+
fmt.Println("-----")
188+
fmt.Println(err)
189+
fmt.Println("-----")
190+
fmt.Println("An error has occured while trying to reset this node.")
191+
fmt.Println("Continuing may leave the cluster in an unexpected state")
192+
return prompts.New().Confirm("Do you want to continue anyway?", false)
193+
}
194+
195+
var resetCommand = &cli.Command{
196+
Name: "reset",
197+
Flags: []cli.Flag{
198+
&cli.BoolFlag{
199+
Name: "yes-really-reset",
200+
Hidden: true,
201+
Value: false,
202+
},
203+
},
204+
Usage: "Reset the node this command is run from",
205+
Action: func(c *cli.Context) error {
206+
207+
fmt.Println("This command will completely reset this node, removing it from the cluster")
208+
if !prompts.New().Confirm("Do you want to continue?", false) {
209+
fmt.Println("aborting.")
210+
return nil
211+
}
212+
213+
fmt.Println("gathering facts...")
214+
// populate options struct with host information
215+
currentHost, err := newHostInfo(c.Context)
216+
if err != nil {
217+
fmt.Println(err)
218+
return nil
219+
}
220+
221+
// basic check to see if it's safe to remove this node from the cluster
222+
if currentHost.isControlPlane() {
223+
safeToRemove, reason, err := currentHost.checkQuorumSafety(c)
224+
if err != nil {
225+
fmt.Println(err)
226+
return nil
227+
}
228+
if !safeToRemove {
229+
fmt.Println(reason)
230+
fmt.Println("run reset command again with --yes-really-reset to ignore this")
231+
return nil
232+
}
233+
}
234+
235+
// determine if this is the only node in the cluster
236+
// if this is a single node we can skip a lot of steps
237+
nodeList := corev1.NodeList{}
238+
currentHost.Kclient.List(c.Context, &nodeList)
239+
if len(nodeList.Items) == 1 {
240+
nodeName := nodeList.Items[0].Name
241+
if nodeName != currentHost.Hostname {
242+
fmt.Println("detected a single node cluster, but the node's name doesn't match our hostname")
243+
return nil
244+
}
245+
// stop k0s
246+
fmt.Printf("resetting %s...\n", binName)
247+
err = currentHost.stopAndResetK0s()
248+
if !checkErrPrompt(err) {
249+
return nil
250+
}
251+
return nil
252+
}
253+
254+
// drain node
255+
fmt.Println("draining node...")
256+
err = currentHost.drainNode()
257+
if !checkErrPrompt(err) {
258+
return nil
259+
}
260+
261+
// remove node from cluster
262+
fmt.Println("removing node from cluster...")
263+
err = currentHost.Kclient.Delete(c.Context, &currentHost.Node)
264+
if !checkErrPrompt(err) {
265+
return nil
266+
}
267+
268+
// controller pre-reset
269+
if currentHost.isControlPlane() {
270+
271+
// delete controlNode object from cluster
272+
fmt.Println("deleting controlNode...")
273+
err := currentHost.Kclient.Delete(c.Context, &currentHost.ControlNode)
274+
if !checkErrPrompt(err) {
275+
return nil
276+
}
277+
278+
// try and leave etcd cluster
279+
fmt.Println("leaving etcd cluster...")
280+
err = currentHost.leaveEtcdcluster()
281+
if !checkErrPrompt(err) {
282+
return nil
283+
}
284+
285+
} else if err != nil {
286+
fmt.Println(err)
287+
return nil
288+
}
289+
290+
// reset
291+
fmt.Printf("resetting %s...\n", binName)
292+
err = currentHost.stopAndResetK0s()
293+
if !checkErrPrompt(err) {
294+
return nil
295+
}
296+
297+
return nil
298+
},
299+
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ require (
157157
gopkg.in/square/go-jose.v2 v2.6.0 // indirect
158158
gopkg.in/yaml.v3 v3.0.1
159159
k8s.io/apiextensions-apiserver v0.29.0 // indirect
160-
k8s.io/client-go v0.29.0 // indirect
160+
k8s.io/client-go v0.29.0
161161
k8s.io/klog/v2 v2.110.1 // indirect
162162
k8s.io/kube-openapi v0.0.0-20231113174909-778a5567bc1e // indirect
163163
k8s.io/utils v0.0.0-20231121161247-cf03d44ff3cf // indirect

0 commit comments

Comments
 (0)