Skip to content
This repository was archived by the owner on Jul 30, 2021. It is now read-only.

Commit 775e3ea

Browse files
authored
Merge pull request #528 from xiang90/t
*: add backup file option to recover subcommand
2 parents 40a7f46 + 56bc9d5 commit 775e3ea

File tree

9 files changed

+488
-11
lines changed

9 files changed

+488
-11
lines changed

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ bootkube start --asset-dir=my-cluster
5858

5959
### Recover a downed cluster
6060

61-
In the case of a partial or total control plane outage (i.e. due to lost master nodes) an experimental `recover` command can extract and write manifests from a backup location. These manifests can then be used by the `start` command to reboot the cluster. Currently recovery from a running apiserver or external running etcd cluster are the only supported methods.
61+
In the case of a partial or total control plane outage (i.e. due to lost master nodes) an experimental `recover` command can extract and write manifests from a backup location. These manifests can then be used by the `start` command to reboot the cluster. Currently, recovery from a running apiserver, from an external running etcd cluster, or from an etcd backup taken from the self-hosted etcd cluster are the supported methods.
6262

6363
To see available options, run:
6464

@@ -78,7 +78,13 @@ Recover from a running apiserver (i.e. if the scheduler pods are all down):
7878
bootkube recover --asset-dir=recovered --kubeconfig=/etc/kubernetes/kubeconfig
7979
```
8080

81-
For a complete recovery example please see the [hack/multi-node/bootkube-test-recovery](hack/multi-node/bootkube-test-recovery) script.
81+
Recover from an etcd backup when self-hosted etcd is enabled:
82+
83+
```
84+
bootkube recover --asset-dir=recovered --etcd-backup-file=backup --kubeconfig=/etc/kubernetes/kubeconfig
85+
```
86+
87+
For a complete recovery example please see the [hack/multi-node/bootkube-test-recovery](hack/multi-node/bootkube-test-recovery) and the [hack/multi-node/bootkube-test-recovery-self-hosted-etcd](hack/multi-node/bootkube-test-recovery-self-hosted-etcd) scripts.
8288

8389
## Building
8490

cmd/bootkube/recover.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
"github.com/kubernetes-incubator/bootkube/pkg/bootkube"
1515
"github.com/kubernetes-incubator/bootkube/pkg/recovery"
16+
"github.com/kubernetes-incubator/bootkube/pkg/util/etcdutil"
1617

1718
"github.com/coreos/etcd/clientv3"
1819
"github.com/spf13/cobra"
@@ -36,6 +37,8 @@ var (
3637
etcdServers string
3738
etcdPrefix string
3839
kubeConfigPath string
40+
podManifestPath string
41+
etcdBackupFile string
3942
}
4043
)
4144

@@ -46,8 +49,10 @@ func init() {
4649
cmdRecover.Flags().StringVar(&recoverOpts.etcdCertificatePath, "etcd-certificate-path", "", "Path to an existing certificate that will be used for TLS-enabled communication between the apiserver and etcd. Must be used in conjunction with --etcd-ca-path and --etcd-private-key-path, and must have etcd configured to use TLS with matching secrets.")
4750
cmdRecover.Flags().StringVar(&recoverOpts.etcdPrivateKeyPath, "etcd-private-key-path", "", "Path to an existing private key that will be used for TLS-enabled communication between the apiserver and etcd. Must be used in conjunction with --etcd-ca-path and --etcd-certificate-path, and must have etcd configured to use TLS with matching secrets.")
4851
cmdRecover.Flags().StringVar(&recoverOpts.etcdServers, "etcd-servers", "", "List of etcd server URLs including host:port, comma separated.")
52+
cmdRecover.Flags().StringVar(&recoverOpts.etcdBackupFile, "etcd-backup-file", "", "Path to the etcd backup file.")
4953
cmdRecover.Flags().StringVar(&recoverOpts.etcdPrefix, "etcd-prefix", "/registry", "Path prefix to Kubernetes cluster data in etcd.")
5054
cmdRecover.Flags().StringVar(&recoverOpts.kubeConfigPath, "kubeconfig", "", "Path to kubeconfig for communicating with the cluster.")
55+
cmdRecover.Flags().StringVar(&recoverOpts.podManifestPath, "pod-manifest-path", "/etc/kubernetes/manifests", "The location where the kubelet is configured to look for static pod manifests. (Only need to be set when recovering from a etcd backup file)")
5156
}
5257

5358
func runCmdRecover(cmd *cobra.Command, args []string) error {
@@ -66,6 +71,34 @@ func runCmdRecover(cmd *cobra.Command, args []string) error {
6671
return err
6772
}
6873
backend = recovery.NewEtcdBackend(etcdClient, recoverOpts.etcdPrefix)
74+
case recoverOpts.etcdBackupFile != "":
75+
bootkube.UserOutput("Attempting recovery using etcd backup file at %q...\n", recoverOpts.etcdBackupFile)
76+
77+
err = recovery.StartRecoveryEtcdForBackup(recoverOpts.podManifestPath, recoverOpts.etcdBackupFile)
78+
if err != nil {
79+
return err
80+
}
81+
defer func() {
82+
err = recovery.CleanRecoveryEtcd(recoverOpts.podManifestPath)
83+
if err != nil {
84+
bootkube.UserOutput("Failed to cleanup recovery etcd from the podManifestPath %v\n", recoverOpts.podManifestPath)
85+
}
86+
}()
87+
88+
bootkube.UserOutput("Waiting for etcd server to start...\n")
89+
90+
err = etcdutil.WaitClusterReady(recovery.RecoveryEtcdClientAddr)
91+
if err != nil {
92+
return err
93+
}
94+
95+
recoverOpts.etcdServers = recovery.RecoveryEtcdClientAddr
96+
etcdClient, err := createEtcdClient()
97+
if err != nil {
98+
return err
99+
}
100+
backend = recovery.NewSelfHostedEtcdBackend(etcdClient, recoverOpts.etcdPrefix, recoverOpts.etcdBackupFile)
101+
69102
default:
70103
bootkube.UserOutput("Attempting recovery using apiserver at %q...\n", recoverOpts.kubeConfigPath)
71104
backend, err = recovery.NewAPIServerBackend(recoverOpts.kubeConfigPath)
@@ -94,6 +127,9 @@ func validateRecoverOpts(cmd *cobra.Command, args []string) error {
94127
if recoverOpts.kubeConfigPath == "" {
95128
return errors.New("missing required flag: --kubeconfig")
96129
}
130+
if recoverOpts.etcdBackupFile != "" && recoverOpts.podManifestPath == "" {
131+
return errors.New("missing required flag: --pod-manifest-path (--etcd-backup-file flag is specified)")
132+
}
97133
return nil
98134
}
99135

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/usr/bin/env bash
# Exercises `bootkube recover --etcd-backup-file` against the multi-node
# Vagrant cluster: copies the etcd data file off the master node, destroys and
# re-creates that node, then recovers and restarts the control plane from the
# copied backup.
#
# Environment:
#   GLOG_v  verbosity passed to bootkube (default 1)
#   HOST    name of the master node to destroy and recover (default c1)
set -euo pipefail

GLOG_v=${GLOG_v:-1}
HOST=${HOST:-c1}

# Without cluster assets there is nothing to recover; stop here rather than
# falling through to the next check and reporting a misleading error.
if [ ! -d "cluster" ]; then
  echo "Need some cluster assets to perform recovery; try running bootkube-up."
  exit 1
fi

# The bootstrap etcd manifest only exists for self-hosted etcd clusters.
if [ ! -f cluster/bootstrap-manifests/bootstrap-etcd.yaml ]; then
  echo "ERROR: run bootkube-test-recovery for non self-hosted etcd Kubernetes cluster."
  exit 1
fi

echo "Getting the etcd backup file"
echo

# Copy the etcd db file to a location readable by the core user, then fetch it.
ssh -q -F ssh_config "core@${HOST}" "sudo cp /var/etcd/kube-system-kube-etcd-0000/member/snap/db /home/core/etcdbackup"
ssh -q -F ssh_config "core@${HOST}" "sudo chown core:core /home/core/etcdbackup"
scp -q -F ssh_config "core@${HOST}:/home/core/etcdbackup" cluster/etcdbackup

echo
echo "Destroying and re-creating the master node..."
echo

vagrant destroy -f "${HOST}"
vagrant up "${HOST}"

echo
echo "As you can see, the cluster is now dead:"
echo

set -x
# The leading `!` inverts the status so the expected failure does not trip `set -e`.
! kubectl --kubeconfig=cluster/auth/kubeconfig get nodes
{ set +x; } 2>/dev/null

echo
echo "Recovering the control plane from the etcd backup..."
echo

scp -q -F ssh_config ../../_output/bin/linux/bootkube cluster/auth/kubeconfig cluster/etcdbackup "core@${HOST}:/home/core"
ssh -q -F ssh_config "core@${HOST}" "sudo GLOG_v=${GLOG_v} /home/core/bootkube recover \
  --asset-dir=/home/core/recovered \
  --etcd-backup-file=/home/core/etcdbackup \
  --kubeconfig=/home/core/kubeconfig 2>> /home/core/recovery.log"

echo
echo "Running bootkube start..."
echo

ssh -q -F ssh_config "core@${HOST}" "sudo GLOG_v=${GLOG_v} /home/core/bootkube start --asset-dir=/home/core/recovered 2>> /home/core/recovery.log"

echo
echo "The cluster should now be recovered. You should be able to access the cluster again using:"
echo "kubectl --kubeconfig=cluster/auth/kubeconfig get nodes"

pkg/recovery/etcd.go

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,22 @@ package recovery
55

66
import (
77
"context"
8+
"encoding/json"
89
"fmt"
10+
"net/url"
11+
"os"
912
"path"
1013
"strings"
1114

15+
"github.com/kubernetes-incubator/bootkube/pkg/asset"
16+
17+
"github.com/coreos/etcd-operator/pkg/spec"
1218
"github.com/coreos/etcd/clientv3"
19+
"github.com/pborman/uuid"
1320
"k8s.io/apimachinery/pkg/api/meta"
1421
"k8s.io/apimachinery/pkg/runtime"
1522
"k8s.io/client-go/pkg/api"
23+
"k8s.io/client-go/pkg/apis/extensions/v1beta1"
1624
)
1725

1826
// etcdBackend is a backend that extracts a controlPlane from an etcd instance.
@@ -54,6 +62,7 @@ func (s *etcdBackend) read(ctx context.Context) (*controlPlane, error) {
5462
return nil, err
5563
}
5664
}
65+
5766
return cp, nil
5867
}
5968

@@ -75,6 +84,19 @@ func (s *etcdBackend) get(ctx context.Context, key string, out runtime.Object, i
7584
return decode(s.decoder, kv.Value, out)
7685
}
7786

87+
func (s *etcdBackend) getBytes(ctx context.Context, key string) ([]byte, error) {
88+
key = path.Join(s.pathPrefix, key)
89+
getResp, err := s.client.KV.Get(ctx, key)
90+
if err != nil {
91+
return nil, err
92+
}
93+
94+
if len(getResp.Kvs) == 0 {
95+
return nil, fmt.Errorf("key not found: %s", key)
96+
}
97+
return getResp.Kvs[0].Value, nil
98+
}
99+
78100
// list fetches a list runtime.Object from etcd located at key prefix `key`.
79101
func (s *etcdBackend) list(ctx context.Context, key string, listObj runtime.Object) error {
80102
listPtr, err := meta.GetItemsPtr(listObj)
@@ -96,3 +118,174 @@ func (s *etcdBackend) list(ctx context.Context, key string, listObj runtime.Obje
96118
}
97119
return decodeList(elems, listPtr, s.decoder)
98120
}
121+
122+
const (
	// RecoveryEtcdClientAddr is the client URL that is handed to the recovery
	// etcd manifest template and used to reach the recovery etcd server.
	RecoveryEtcdClientAddr = "http://localhost:52379"

	// assetPathRecoveryEtcd is the file name of the recovery etcd manifest.
	assetPathRecoveryEtcd = "recovery-etcd.yaml"

	// etcdTPRKey and etcdMemberPodPrefix are etcd keys, relative to the
	// configured path prefix, of the kube-etcd cluster TPR and of the
	// kube-etcd member pods.
	etcdTPRKey          = "ThirdPartyResourceData/etcd.coreos.com/clusters/kube-system/kube-etcd"
	etcdMemberPodPrefix = "pods/kube-system/kube-etcd-"
)
128+
129+
type etcdSelfhostedBackend struct {
130+
*etcdBackend
131+
132+
backupPath string
133+
}
134+
135+
// NewSelfHostedEtcdBackend constructs a new etcdBackend for the given client and pathPrefix, and backup file.
136+
func NewSelfHostedEtcdBackend(client *clientv3.Client, pathPrefix, backupPath string) Backend {
137+
eb := &etcdBackend{
138+
client: client,
139+
decoder: api.Codecs.UniversalDecoder(),
140+
pathPrefix: pathPrefix,
141+
}
142+
143+
return &etcdSelfhostedBackend{
144+
etcdBackend: eb,
145+
backupPath: backupPath,
146+
}
147+
}
148+
149+
// read implements Backend.read().
150+
func (s *etcdSelfhostedBackend) read(ctx context.Context) (*controlPlane, error) {
151+
cp, err := s.etcdBackend.read(ctx)
152+
if err != nil {
153+
return nil, err
154+
}
155+
156+
d, err := s.getBytes(ctx, etcdTPRKey)
157+
if err != nil {
158+
return nil, err
159+
}
160+
161+
var tpr v1beta1.ThirdPartyResourceData
162+
err = decode(s.decoder, d, &tpr)
163+
if err != nil {
164+
return nil, err
165+
}
166+
167+
var kubeetcd spec.Cluster
168+
err = json.Unmarshal(tpr.Data, &kubeetcd)
169+
if err != nil {
170+
return nil, err
171+
}
172+
173+
etpr, err := createEtcdTPRAsset(kubeetcd)
174+
if err != nil {
175+
return nil, err
176+
}
177+
cp.tpr = etpr
178+
179+
serviceIP, err := getServiceIPFromClusterSpec(kubeetcd.Spec)
180+
if err != nil {
181+
return nil, err
182+
}
183+
eas := createBootEtcdAsset(s.pathPrefix, s.backupPath, serviceIP)
184+
esas := createBootEtcdServiceAsset(serviceIP)
185+
cp.bootEtcd = &eas
186+
cp.bootEtcdService = &esas
187+
188+
return cp, nil
189+
}
190+
191+
// StartRecoveryEtcdForBackup starts a recovery etcd container using given backup.
192+
// The started etcd server listens on RecoveryEtcdClientAddr.
193+
func StartRecoveryEtcdForBackup(p, backupPath string) error {
194+
d, f := path.Split(backupPath)
195+
196+
config := struct {
197+
Image string
198+
BackupFile string
199+
BackupDir string
200+
ClientAddr string
201+
}{
202+
Image: asset.DefaultImages.Etcd,
203+
BackupFile: f,
204+
BackupDir: d,
205+
ClientAddr: RecoveryEtcdClientAddr,
206+
}
207+
208+
as := asset.MustCreateAssetFromTemplate(assetPathRecoveryEtcd, recoveryEtcdTemplate, config)
209+
return as.WriteFile(p)
210+
}
211+
212+
// CleanRecoveryEtcd removes the recovery etcd static pod manifest and stops the recovery
213+
// etcd container.
214+
func CleanRecoveryEtcd(p string) error {
215+
return os.Remove(path.Join(p, assetPathRecoveryEtcd))
216+
}
217+
218+
func createBootEtcdAsset(pathPrefix, backupPath, serviceIP string) asset.Asset {
219+
d, f := path.Split(backupPath)
220+
221+
config := struct {
222+
Image string
223+
BackupFile string
224+
BackupDir string
225+
BootEtcdServiceIP string
226+
TPRKey string
227+
MemberPodPrefix string
228+
ClusterToken string
229+
}{
230+
Image: asset.DefaultImages.Etcd,
231+
BackupFile: f,
232+
BackupDir: d,
233+
BootEtcdServiceIP: serviceIP,
234+
TPRKey: path.Join(pathPrefix, etcdTPRKey),
235+
MemberPodPrefix: path.Join(pathPrefix, etcdMemberPodPrefix),
236+
ClusterToken: "bootkube-recovery-" + uuid.New(),
237+
}
238+
239+
return asset.MustCreateAssetFromTemplate(asset.AssetPathBootstrapEtcd, bootFromBackupEtcdTemplate, config)
240+
}
241+
242+
func createBootEtcdServiceAsset(serviceIP string) asset.Asset {
243+
config := struct{ BootEtcdServiceIP string }{BootEtcdServiceIP: serviceIP}
244+
245+
return asset.MustCreateAssetFromTemplate(asset.AssetPathBootstrapEtcdService, recoveryEtcdSvcTemplate, config)
246+
}
247+
248+
func createEtcdTPRAsset(s spec.Cluster) (*asset.Asset, error) {
249+
clone := cloneEtcdClusterTPR(s)
250+
251+
data, err := json.Marshal(clone)
252+
if err != nil {
253+
return nil, err
254+
}
255+
256+
return &asset.Asset{
257+
Name: asset.AssetPathMigrateEtcdCluster,
258+
Data: data,
259+
}, nil
260+
}
261+
262+
func getServiceIPFromClusterSpec(s spec.ClusterSpec) (string, error) {
263+
ep := s.SelfHosted.BootMemberClientEndpoint
264+
u, err := url.Parse(ep)
265+
if err != nil {
266+
return "", err
267+
}
268+
return stripPort(u.Host), nil
269+
}
270+
271+
func cloneEtcdClusterTPR(s spec.Cluster) spec.Cluster {
272+
var clone spec.Cluster
273+
clone.Spec = s.Spec
274+
clone.Metadata.SetName(s.Metadata.GetName())
275+
clone.Metadata.SetNamespace(s.Metadata.GetNamespace())
276+
clone.APIVersion = s.APIVersion
277+
clone.Kind = s.Kind
278+
279+
return clone
280+
}
281+
282+
// stripPort returns hostport without its port. Input that contains no colon
// is returned unchanged. If the input contains a closing bracket (a bracketed
// IPv6 literal), the text before that bracket is returned with the leading
// "[" removed. Otherwise everything before the first colon is returned.
func stripPort(hostport string) string {
	firstColon := strings.IndexByte(hostport, ':')
	closeBracket := strings.IndexByte(hostport, ']')

	switch {
	case firstColon < 0:
		// No port separator at all.
		return hostport
	case closeBracket >= 0:
		// Bracketed IPv6 literal, with or without a trailing port.
		return strings.TrimPrefix(hostport[:closeBracket], "[")
	default:
		return hostport[:firstColon]
	}
}

0 commit comments

Comments
 (0)