Skip to content
This repository was archived by the owner on Jul 30, 2021. It is now read-only.

Commit 6f39a81

Browse files
authored
Preliminary recovery command for bootkube. (#491)
The new `bootkube recover` command provides a framework for reading control plane manifests out of a storage backend, serializing them to disk, and constructing pod manifests for the bootstrap components that are usable with `bootkube start` to re-bootstrap the control plane. The command initially provides an etcd-client backend for use with a running etcd cluster. In the future we can add support for other methods, such as an etcd backup directory or a running apiserver. There is also documentation and an example script in hack/multi-node that deletes a master node, recovers the assets from etcd, and then re-bootstraps the control plane.
1 parent 6204a98 commit 6f39a81

File tree

10 files changed

+1034
-47
lines changed

10 files changed

+1034
-47
lines changed

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ If you are interested in the design and details [see the Kubernetes self-hosted
1818

1919
## Usage
2020

21-
Bootkube has two main commands: `render` and `start`
21+
Bootkube has two main commands: `render` and `start`.
22+
23+
There is a third, experimental command `recover` which can help reboot a downed cluster (see below).
2224

2325
### Render assets
2426

@@ -54,6 +56,24 @@ Example:
5456
bootkube start --asset-dir=my-cluster
5557
```
5658

59+
### Recover a downed cluster
60+
61+
In the case of a partial or total control plane outage (i.e. due to lost master nodes) an experimental `recover` command can extract and write manifests from a backup location. These manifests can then be used by the `start` command to reboot the cluster. Currently recovery from an external running etcd cluster is the only supported method.
62+
63+
To see available options, run:
64+
65+
```
66+
bootkube recover --help
67+
```
68+
69+
Example:
70+
71+
```
72+
bootkube recover --asset-dir=recovered --etcd-servers=http://127.0.0.1:2379 --kubeconfig=/etc/kubernetes/kubeconfig
73+
```
74+
75+
For a complete recovery example please see the [hack/multi-node/bootkube-test-recovery](hack/multi-node/bootkube-test-recovery) script.
76+
5777
## Building
5878

5979
First, clone the repo into the proper location in your $GOPATH:

cmd/bootkube/recover.go

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"crypto/tls"
6+
"crypto/x509"
7+
"errors"
8+
"fmt"
9+
"io/ioutil"
10+
"path/filepath"
11+
"strings"
12+
"time"
13+
14+
"github.com/kubernetes-incubator/bootkube/pkg/recovery"
15+
16+
"github.com/coreos/etcd/clientv3"
17+
"github.com/spf13/cobra"
18+
)
19+
20+
var (
21+
cmdRecover = &cobra.Command{
22+
Use: "recover",
23+
Short: "Recover a control plane from state stored in etcd.",
24+
Long: "",
25+
PreRunE: validateRecoverOpts,
26+
RunE: runCmdRecover,
27+
SilenceUsage: true,
28+
}
29+
30+
recoverOpts struct {
31+
assetDir string
32+
etcdCAPath string
33+
etcdCertificatePath string
34+
etcdPrivateKeyPath string
35+
etcdServers string
36+
etcdPrefix string
37+
kubeConfigPath string
38+
}
39+
)
40+
41+
func init() {
42+
cmdRoot.AddCommand(cmdRecover)
43+
cmdRecover.Flags().StringVar(&recoverOpts.assetDir, "asset-dir", "", "Output path for writing recovered cluster assets.")
44+
cmdRecover.Flags().StringVar(&recoverOpts.etcdCAPath, "etcd-ca-path", "", "Path to an existing PEM encoded CA that will be used for TLS-enabled communication between the apiserver and etcd. Must be used in conjunction with --etcd-certificate-path and --etcd-private-key-path, and must have etcd configured to use TLS with matching secrets.")
45+
cmdRecover.Flags().StringVar(&recoverOpts.etcdCertificatePath, "etcd-certificate-path", "", "Path to an existing certificate that will be used for TLS-enabled communication between the apiserver and etcd. Must be used in conjunction with --etcd-ca-path and --etcd-private-key-path, and must have etcd configured to use TLS with matching secrets.")
46+
cmdRecover.Flags().StringVar(&recoverOpts.etcdPrivateKeyPath, "etcd-private-key-path", "", "Path to an existing private key that will be used for TLS-enabled communication between the apiserver and etcd. Must be used in conjunction with --etcd-ca-path and --etcd-certificate-path, and must have etcd configured to use TLS with matching secrets.")
47+
cmdRecover.Flags().StringVar(&recoverOpts.etcdServers, "etcd-servers", "", "List of etcd servers URLs including host:port, comma separated.")
48+
cmdRecover.Flags().StringVar(&recoverOpts.etcdPrefix, "etcd-prefix", "/registry", "Path prefix to Kubernetes cluster data in etcd.")
49+
cmdRecover.Flags().StringVar(&recoverOpts.kubeConfigPath, "kubeconfig", "", "Path to kubeconfig for communicating with the cluster.")
50+
}
51+
52+
func runCmdRecover(cmd *cobra.Command, args []string) error {
53+
var err error
54+
recoverOpts.kubeConfigPath, err = filepath.Abs(recoverOpts.kubeConfigPath)
55+
if err != nil {
56+
return err
57+
}
58+
etcdClient, err := createEtcdClient()
59+
if err != nil {
60+
return err
61+
}
62+
as, err := recovery.Recover(context.Background(), recovery.NewEtcdBackend(etcdClient, recoverOpts.etcdPrefix), recoverOpts.kubeConfigPath)
63+
if err != nil {
64+
return err
65+
}
66+
return as.WriteFiles(recoverOpts.assetDir)
67+
}
68+
69+
func validateRecoverOpts(cmd *cobra.Command, args []string) error {
70+
if recoverOpts.assetDir == "" {
71+
return errors.New("missing required flag: --asset-dir")
72+
}
73+
if (recoverOpts.etcdCAPath != "" || recoverOpts.etcdCertificatePath != "" || recoverOpts.etcdPrivateKeyPath != "") && (recoverOpts.etcdCAPath == "" || recoverOpts.etcdCertificatePath == "" || recoverOpts.etcdPrivateKeyPath == "") {
74+
return errors.New("you must specify either all or none of --etcd-ca-path, --etcd-certificate-path, and --etcd-private-key-path")
75+
}
76+
if recoverOpts.etcdPrefix == "" {
77+
return errors.New("missing required flag: --etcd-prefix")
78+
}
79+
if recoverOpts.kubeConfigPath == "" {
80+
return errors.New("missing required flag: --kubeconfig")
81+
}
82+
return nil
83+
}
84+
85+
func createEtcdClient() (*clientv3.Client, error) {
86+
cfg := clientv3.Config{
87+
Endpoints: strings.Split(recoverOpts.etcdServers, ","),
88+
DialTimeout: 5 * time.Second,
89+
}
90+
if recoverOpts.etcdCAPath != "" {
91+
clientCert, err := tls.LoadX509KeyPair(recoverOpts.etcdCertificatePath, recoverOpts.etcdPrivateKeyPath)
92+
if err != nil {
93+
return nil, err
94+
}
95+
roots := x509.NewCertPool()
96+
etcdCA, err := ioutil.ReadFile(recoverOpts.etcdCAPath)
97+
if err != nil {
98+
return nil, err
99+
}
100+
if ok := roots.AppendCertsFromPEM(etcdCA); !ok {
101+
return nil, fmt.Errorf("error processing --etcd-ca-file %s", recoverOpts.etcdCAPath)
102+
}
103+
cfg.TLS = &tls.Config{
104+
Certificates: []tls.Certificate{clientCert},
105+
RootCAs: roots,
106+
}
107+
}
108+
return clientv3.New(cfg)
109+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env bash
# Recovery demo: destroy the master node, recover control plane assets from
# the external etcd cluster with `bootkube recover`, then re-bootstrap the
# control plane with `bootkube start`.
set -euo pipefail

GLOG_v=${GLOG_v:-1}
HOST=${HOST:-c1}
SELF_HOST_ETCD=${SELF_HOST_ETCD:-false}

if [ ! -d "cluster" ]; then
  echo "Need some cluster assets to perform recovery; try running bootkube-up."
  # Fixed: abort here -- previously the script fell through and kept running
  # with no cluster assets (a bare `echo` does not trip `set -e`).
  exit 1
fi

if [ -f cluster/bootstrap-manifests/bootstrap-etcd.yaml ]; then
  echo "ERROR: $(basename $0) does not currently support self-hosted etcd."
  exit 1
fi

echo
echo "Destroying and re-creating the master node..."
echo

vagrant destroy -f $HOST
vagrant up $HOST

echo
echo "As you can see, the cluster is now dead:"
echo

set -x
! kubectl --kubeconfig=cluster/auth/kubeconfig get nodes
{ set +x; } 2>/dev/null

echo
echo "Recovering the control plane from etcd..."
echo

scp -q -F ssh_config ../../_output/bin/linux/bootkube cluster/auth/kubeconfig cluster/tls/etcd-* core@$HOST:/home/core
ssh -q -F ssh_config core@$HOST "GLOG_v=${GLOG_v} /home/core/bootkube recover \
  --asset-dir=/home/core/recovered \
  --etcd-ca-path=/home/core/etcd-ca.crt \
  --etcd-certificate-path=/home/core/etcd-client.crt \
  --etcd-private-key-path=/home/core/etcd-client.key \
  --etcd-servers=https://172.17.4.51:2379 \
  --kubeconfig=/home/core/kubeconfig 2>> /home/core/recovery.log"

echo
echo "Running bootkube start..."
echo

ssh -q -F ssh_config core@$HOST "sudo GLOG_v=${GLOG_v} /home/core/bootkube start --asset-dir=/home/core/recovered 2>> /home/core/recovery.log"

echo
echo "The cluster should now be recovered. You should be able to access the cluster again using:"
echo "kubectl --kubeconfig=cluster/auth/kubeconfig get nodes"
echo

pkg/bootkube/bootkube.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,21 @@ func NewBootkube(config Config) (*bootkube, error) {
4141
}
4242

4343
func (b *bootkube) Run() error {
44-
defer func() {
45-
// Always clean up the bootstrap control plane and secrets.
46-
if err := CleanupBootstrapControlPlane(b.assetDir, b.podManifestPath); err != nil {
47-
UserOutput("Error cleaning up temporary bootstrap control plane: %v\n", err)
48-
}
49-
}()
50-
5144
// TODO(diegs): create and share a single client rather than the kubeconfig once all uses of it
5245
// are migrated to client-go.
5346
kubeConfig = clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
5447
&clientcmd.ClientConfigLoadingRules{ExplicitPath: filepath.Join(b.assetDir, asset.AssetPathKubeConfig)},
5548
&clientcmd.ConfigOverrides{})
5649

50+
bcp := NewBootstrapControlPlane(b.assetDir, b.podManifestPath)
51+
52+
defer func() {
53+
// Always tear down the bootstrap control plane and clean up manifests and secrets.
54+
if err := bcp.Teardown(); err != nil {
55+
UserOutput("Error tearing down temporary bootstrap control plane: %v\n", err)
56+
}
57+
}()
58+
5759
var err error
5860
defer func() {
5961
// Always report errors.
@@ -62,7 +64,7 @@ func (b *bootkube) Run() error {
6264
}
6365
}()
6466

65-
if err = CreateBootstrapControlPlane(b.assetDir, b.podManifestPath); err != nil {
67+
if err = bcp.Start(); err != nil {
6668
return err
6769
}
6870

pkg/bootkube/bootstrap.go

Lines changed: 56 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,68 +5,62 @@ import (
55
"io/ioutil"
66
"os"
77
"path/filepath"
8+
"strings"
89

910
"github.com/kubernetes-incubator/bootkube/pkg/asset"
1011
)
1112

12-
// CreateBootstrapControlPlane seeds static manifests to the kubelet to launch the bootstrap control
13-
// plane.
14-
func CreateBootstrapControlPlane(assetDir string, podManifestPath string) error {
15-
UserOutput("Running temporary bootstrap control plane...\n")
13+
// bootstrapControlPlane tracks the temporary bootstrap control plane's
// assets so that everything it writes can be torn down again later.
type bootstrapControlPlane struct {
	assetDir        string   // root directory of rendered cluster assets
	podManifestPath string   // kubelet static pod manifest directory
	ownedManifests  []string // manifest files this object wrote and must remove
}

// NewBootstrapControlPlane constructs a new bootstrap control plane object.
func NewBootstrapControlPlane(assetDir, podManifestPath string) *bootstrapControlPlane {
	bcp := &bootstrapControlPlane{}
	bcp.assetDir = assetDir
	bcp.podManifestPath = podManifestPath
	return bcp
}
1626

27+
// Start seeds static manifests to the kubelet to launch the bootstrap control plane.
28+
// Users should always ensure that Cleanup() is called even in the case of errors.
29+
func (b *bootstrapControlPlane) Start() error {
30+
UserOutput("Starting temporary bootstrap control plane...\n")
1731
// Make secrets temporarily available to bootstrap cluster.
1832
if err := os.RemoveAll(asset.BootstrapSecretsDir); err != nil {
1933
return err
2034
}
21-
if err := os.Mkdir(asset.BootstrapSecretsDir, os.FileMode(0700)); err != nil {
35+
secretsDir := filepath.Join(b.assetDir, asset.AssetPathSecrets)
36+
if _, err := copyDirectory(secretsDir, asset.BootstrapSecretsDir, true /* overwrite */); err != nil {
2237
return err
2338
}
24-
secretsDir := filepath.Join(assetDir, asset.AssetPathSecrets)
25-
secrets, err := ioutil.ReadDir(secretsDir)
26-
if err != nil {
27-
return err
28-
}
29-
for _, secret := range secrets {
30-
if err := copyFile(filepath.Join(secretsDir, secret.Name()), filepath.Join(asset.BootstrapSecretsDir, secret.Name()), true); err != nil {
31-
return err
32-
}
33-
}
34-
3539
// Copy the static manifests to the kubelet's pod manifest path.
36-
manifestsDir := filepath.Join(assetDir, asset.AssetPathBootstrapManifests)
37-
manifests, err := ioutil.ReadDir(manifestsDir)
38-
if err != nil {
39-
return err
40-
}
41-
for _, manifest := range manifests {
42-
if err := copyFile(filepath.Join(manifestsDir, manifest.Name()), filepath.Join(podManifestPath, manifest.Name()), false); err != nil {
43-
return err
44-
}
45-
}
46-
47-
return nil
40+
manifestsDir := filepath.Join(b.assetDir, asset.AssetPathBootstrapManifests)
41+
ownedManifests, err := copyDirectory(manifestsDir, b.podManifestPath, false /* overwrite */)
42+
b.ownedManifests = ownedManifests // always copy in case of partial failure.
43+
return err
4844
}
4945

50-
// CleanupBootstrapControlPlane brings down the bootstrap control plane and cleans up the temporary
46+
// Teardown brings down the bootstrap control plane and cleans up the temporary manifests and
5147
// secrets. This function is idempotent.
52-
func CleanupBootstrapControlPlane(assetDir string, podManifestPath string) error {
53-
UserOutput("Cleaning up temporary bootstrap control plane...\n")
54-
48+
func (b *bootstrapControlPlane) Teardown() error {
49+
UserOutput("Tearing down temporary bootstrap control plane...\n")
5550
if err := os.RemoveAll(asset.BootstrapSecretsDir); err != nil {
5651
return err
5752
}
58-
manifests, err := ioutil.ReadDir(filepath.Join(assetDir, asset.AssetPathBootstrapManifests))
59-
if err != nil {
60-
return err
61-
}
62-
for _, manifest := range manifests {
63-
if err := os.Remove(filepath.Join(podManifestPath, manifest.Name())); err != nil && !os.IsNotExist(err) {
53+
for _, manifest := range b.ownedManifests {
54+
if err := os.Remove(manifest); err != nil && !os.IsNotExist(err) {
6455
return err
6556
}
6657
}
58+
b.ownedManifests = nil
6759
return nil
6860
}
6961

62+
// copyFile copies a single file from src to dst. Returns an error if overwrite is true and dst
63+
// exists, or if any I/O error occurs during copying.
7064
func copyFile(src, dst string, overwrite bool) error {
7165
if !overwrite {
7266
fi, err := os.Stat(dst)
@@ -83,3 +77,27 @@ func copyFile(src, dst string, overwrite bool) error {
8377
}
8478
return ioutil.WriteFile(dst, data, os.FileMode(0600))
8579
}
80+
81+
// copyDirectory copies srcDir to dstDir recursively. It returns the paths of files (not
82+
// directories) that were copied.
83+
func copyDirectory(srcDir, dstDir string, overwrite bool) ([]string, error) {
84+
var copied []string
85+
return copied, filepath.Walk(srcDir, func(src string, info os.FileInfo, err error) error {
86+
if err != nil {
87+
return err
88+
}
89+
dst := filepath.Join(dstDir, strings.TrimPrefix(src, srcDir))
90+
if info.IsDir() {
91+
err = os.Mkdir(dst, os.FileMode(0700))
92+
if os.IsExist(err) {
93+
err = nil
94+
}
95+
return err
96+
}
97+
if err := copyFile(src, dst, overwrite); err != nil {
98+
return err
99+
}
100+
copied = append(copied, dst)
101+
return nil
102+
})
103+
}

pkg/bootkube/create.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package bootkube
22

33
import (
44
"fmt"
5+
"os"
56
"strings"
67
"time"
78

@@ -19,6 +20,11 @@ import (
1920
)
2021

2122
func CreateAssets(manifestDir string, timeout time.Duration) error {
23+
if _, err := os.Stat(manifestDir); os.IsNotExist(err) {
24+
UserOutput(fmt.Sprintf("WARNING: %v does not exist, not creating any self-hosted assets.\n", manifestDir))
25+
return nil
26+
}
27+
2228
upFn := func() (bool, error) {
2329
if err := apiTest(); err != nil {
2430
glog.Warningf("Unable to determine api-server readiness: %v", err)

0 commit comments

Comments
 (0)