
Commit c6a5b47

Merge pull request #8 from dippynark/improve-remediation-and-cluster-deletion
Improve remediation and cluster deletion
2 parents 731d0dd + 5691b90

4 files changed: 36 additions, 15 deletions

config/rbac/role.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -37,6 +37,7 @@ rules:
   - pods
   verbs:
   - create
+  - delete
   - get
   - list
   - watch
```
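The new `delete` verb is what authorises the controller to remove machine Pods. A minimal sketch of the call it permits, using the controller-runtime client (the helper name here is illustrative, not part of this commit):

```go
package pods

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// deleteMachinePod removes the Pod backing a KubernetesMachine. Without the
// "delete" verb granted on pods in the controller's role, the API server
// would reject this call as Forbidden.
func deleteMachinePod(ctx context.Context, c client.Client, machinePod *corev1.Pod) error {
	return c.Delete(ctx, machinePod)
}
```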

controllers/kubernetesmachine_controller.go

Lines changed: 14 additions & 7 deletions
```diff
@@ -131,7 +131,7 @@ type KubernetesMachineReconciler struct {
 // +kubebuilder:rbac:groups=infrastructure.dippynark.co.uk,resources=kubernetesmachines,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=infrastructure.dippynark.co.uk,resources=kubernetesmachines/status,verbs=get;update;patch
 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines,verbs=get;list;watch
-// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create
+// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;delete
 // +kubebuilder:rbac:groups=core,resources=pods/exec,verbs=create
 // +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create
 // +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create
@@ -215,7 +215,7 @@ func (r *KubernetesMachineReconciler) Reconcile(req ctrl.Request) (_ ctrl.Result
 
 	// Handle deleted machines
 	if !kubernetesMachine.ObjectMeta.DeletionTimestamp.IsZero() {
-		return r.reconcileDelete(ctx, machine, kubernetesMachine)
+		return r.reconcileDelete(ctx, machine, kubernetesMachine, cluster)
 	}
 
 	// Make sure infrastructure is ready
@@ -511,6 +511,13 @@ func (r *KubernetesMachineReconciler) reconcileNormal(ctx context.Context, clust
 		return ctrl.Result{}, nil
 	}
 	if kindContainerStatus.State.Terminated != nil {
+
+		if kubernetesMachine.Spec.AllowRecreation {
+			// Delete Pod to allow it to be recreated
+			log.Info("Deleting Pod due to terminated kind container")
+			return ctrl.Result{}, r.Delete(ctx, machinePod)
+		}
+
 		kubernetesMachine.Status.SetFailureReason(capierrors.UnsupportedChangeMachineError)
 		kubernetesMachine.Status.SetFailureMessage(errors.Errorf("kind container has terminated: %s", kindContainerStatus.State.Terminated.Reason))
 
@@ -552,11 +559,11 @@ func (r *KubernetesMachineReconciler) reconcileNormal(ctx context.Context, clust
 		return ctrl.Result{}, nil
 	}
 
-func (r *KubernetesMachineReconciler) reconcileDelete(ctx context.Context, machine *clusterv1.Machine, kubernetesMachine *capkv1.KubernetesMachine) (ctrl.Result, error) {
-	// If the deleted machine is a control-plane node, exec kubeadm reset so the
-	// etcd member hosted on the machine gets removed in a controlled way
-
-	if util.IsControlPlaneMachine(machine) {
+func (r *KubernetesMachineReconciler) reconcileDelete(ctx context.Context, machine *clusterv1.Machine, kubernetesMachine *capkv1.KubernetesMachine, cluster *clusterv1.Cluster) (ctrl.Result, error) {
+	// If the deleted machine is a control plane node, exec kubeadm reset so the etcd member hosted on
+	// the machine gets removed in a controlled way. If the cluster has been deleted then we skip this
+	// step to stop it hanging forever in the case of control plane failure
+	if cluster.ObjectMeta.DeletionTimestamp.IsZero() && util.IsControlPlaneMachine(machine) {
 		// Check if machine pod exists
 		machinePod := &corev1.Pod{}
 		err := r.Client.Get(ctx, types.NamespacedName{
```
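The key behavioural change is the guard on `reconcileDelete`: `kubeadm reset` is now skipped once the owning Cluster itself is being deleted, so a broken control plane can no longer block cluster deletion. A minimal sketch of that decision, factored into a standalone helper (the helper name is hypothetical; `util.IsControlPlaneMachine` is the real cluster-api helper used in the diff, and the import paths assume the cluster-api v1alpha3 era of this commit):

```go
package remediation

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util"
)

// shouldKubeadmReset reports whether deleting this machine should first run
// kubeadm reset to remove its etcd member in a controlled way. When the whole
// Cluster is being deleted (non-zero deletion timestamp), the reset is skipped
// so that control plane failure cannot make deletion hang forever.
func shouldKubeadmReset(cluster *clusterv1.Cluster, machine *clusterv1.Machine) bool {
	if !cluster.ObjectMeta.DeletionTimestamp.IsZero() {
		return false
	}
	// Only control plane machines host etcd members, so workers never need a reset.
	return util.IsControlPlaneMachine(machine)
}
```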

docs/flavors.md

Lines changed: 19 additions & 6 deletions
````diff
@@ -11,10 +11,17 @@ The default flavor creates a Kubernetes cluster with the controller Nodes manage
 [KubeadmControlPlane](https://github.com/kubernetes-sigs/cluster-api/blob/master/docs/proposals/20191017-kubeadm-based-control-plane.md)
 resource and the worker Nodes managed by a
 [MachineDeployment](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-deployment.html)
-resource. The controller Nodes write etcd state to the container file system and the corresponding
-KubernetesMachines will fail if the underlying Pods fails, relying on the
+resource.
+
+The controller Nodes write etcd state to an
+[emptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) volume and the
+corresponding KubernetesMachines will fail if the underlying Pods fail, relying on the
 KubeadmControlPlane for remediation.
 
+The worker Nodes will fail if their underlying Pods fail, relying on the MachineDeployment and a
+[MachineHealthCheck](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-health-check.html)
+for remediation.
+
 ```sh
 CLUSTER_NAME="example"
 export KUBERNETES_CONTROL_PLANE_SERVICE_TYPE="LoadBalancer"
@@ -48,14 +55,20 @@ by a
 [KubeadmControlPlane](https://github.com/kubernetes-sigs/cluster-api/blob/master/docs/proposals/20191017-kubeadm-based-control-plane.md)
 resource and the worker Nodes managed by a
 [MachineDeployment](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-deployment.html)
-resource. PersistentVolumes are dynamically provisioned for the controller Nodes to write etcd state
-and the corresponding KubernetesMachines are configured to recreate the underlying Pod if it is
-deleted as described in [persistence.md](persistence.md).
+resource.
+
+PersistentVolumes are dynamically provisioned for the controller Nodes to write etcd state and the
+corresponding KubernetesMachines are configured to recreate their underlying Pods if they fail as
+described in [persistence.md](persistence.md).
+
+The worker Nodes will fail if their underlying Pods fail, relying on the MachineDeployment and a
+[MachineHealthCheck](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-health-check.html)
+for remediation.
 
 ```sh
 CLUSTER_NAME="example"
 export KUBERNETES_CONTROL_PLANE_SERVICE_TYPE="LoadBalancer"
-export ETCD_STORAGE_CLASS_NAME="ssd"
+export ETCD_STORAGE_CLASS_NAME="premium-rwo"
 export ETCD_STORAGE_SIZE="1Gi"
 clusterctl config cluster $CLUSTER_NAME \
   --infrastructure kubernetes \
````

docs/persistence.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,7 +1,7 @@
 # Persistence
 
-By default when the Pod backing a KubernetesMachine is deleted, the KubernetesMachine (and therefore
-the managing Machine) will be set to failed. However, if the
+By default when the Pod backing a KubernetesMachine fails or is deleted, the KubernetesMachine (and
+therefore the managing Machine) will be set to failed. However, if the
 `kubernetesMachine.spec.allowRecreation` field is set to `true`, the Pod will instead be recreated
 with the same name. For controller Machines, by mounting a PersistentVolume at the etcd data
 directory, the Pod can recover without data loss and without the managing KubernetesMachine failing:
```
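For context on the recovery path this documents, the arrangement at the PodSpec level is roughly a PersistentVolumeClaim mounted at the standard kubeadm etcd data directory (`/var/lib/etcd`). The sketch below uses illustrative names throughout, not the provider's actual generated spec:

```go
package persistence

import corev1 "k8s.io/api/core/v1"

// etcdPodSpec sketches a machine Pod whose etcd data directory is backed by a
// PersistentVolumeClaim, so a recreated Pod (same name) finds the previous
// etcd state intact. Volume, image and claim names are hypothetical.
func etcdPodSpec() corev1.PodSpec {
	return corev1.PodSpec{
		Containers: []corev1.Container{{
			Name:  "kind",
			Image: "kindest/node", // placeholder image reference
			VolumeMounts: []corev1.VolumeMount{{
				Name:      "etcd-data",
				MountPath: "/var/lib/etcd", // kubeadm's default etcd data directory
			}},
		}},
		Volumes: []corev1.Volume{{
			Name: "etcd-data",
			VolumeSource: corev1.VolumeSource{
				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
					ClaimName: "example-etcd", // hypothetical dynamically provisioned claim
				},
			},
		}},
	}
}
```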
