Skip to content

Commit 02d7d4b

Browse files
committed
Add termination condition to deleting nodes on OOT
1 parent f81674f commit 02d7d4b

File tree

3 files changed

+147
-54
lines changed

3 files changed

+147
-54
lines changed

pkg/util/provider/machinecontroller/machine_test.go

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
. "github.com/onsi/ginkgo/extensions/table"
3232
. "github.com/onsi/gomega"
3333
corev1 "k8s.io/api/core/v1"
34+
v1 "k8s.io/api/core/v1"
3435
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3536
"k8s.io/apimachinery/pkg/runtime"
3637
"k8s.io/apimachinery/pkg/util/validation/field"
@@ -763,10 +764,11 @@ var _ = Describe("machine", func() {
763764
fakeDriver *driver.FakeDriver
764765
}
765766
type expect struct {
766-
machine *v1alpha1.Machine
767-
err error
768-
nodeDeleted bool
769-
retry machineutils.Retry
767+
machine *v1alpha1.Machine
768+
err error
769+
nodeTerminationConditionIsSet bool
770+
nodeDeleted bool
771+
retry machineutils.Retry
770772
}
771773
type data struct {
772774
setup setup
@@ -844,6 +846,13 @@ var _ = Describe("machine", func() {
844846
_, nodeErr := controller.targetCoreClient.CoreV1().Nodes().Get(machine.Status.Node, metav1.GetOptions{})
845847
Expect(nodeErr).To(HaveOccurred())
846848
}
849+
if data.expect.nodeTerminationConditionIsSet {
850+
node, nodeErr := controller.targetCoreClient.CoreV1().Nodes().Get(machine.Status.Node, metav1.GetOptions{})
851+
Expect(nodeErr).To(Not(HaveOccurred()))
852+
Expect(len(node.Status.Conditions)).To(Equal(1))
853+
Expect(node.Status.Conditions[0].Type).To(Equal(machineutils.NodeTerminationCondition))
854+
Expect(node.Status.Conditions[0].Status).To(Equal(v1.ConditionTrue))
855+
}
847856

848857
},
849858
Entry("Do not process machine deletion for object without finalizer", &data{
@@ -1174,23 +1183,31 @@ var _ = Describe("machine", func() {
11741183
machineutils.MachinePriority: "3",
11751184
},
11761185
map[string]string{
1177-
"node": "fakeID-0",
1186+
"node": "fakeNode-0",
11781187
},
11791188
true,
11801189
),
1190+
nodes: []*corev1.Node{
1191+
{
1192+
ObjectMeta: metav1.ObjectMeta{
1193+
Name: "fakeNode-0",
1194+
},
1195+
},
1196+
},
11811197
},
11821198
action: action{
11831199
machine: "machine-0",
11841200
fakeDriver: &driver.FakeDriver{
11851201
VMExists: true,
1186-
ProviderID: "fakeID-0",
1202+
ProviderID: "fakeID",
11871203
NodeName: "fakeNode-0",
11881204
Err: nil,
11891205
},
11901206
},
11911207
expect: expect{
1192-
err: fmt.Errorf("Machine deletion in process. Drain successful. %s", machineutils.InitiateVMDeletion),
1193-
retry: machineutils.RetryOp,
1208+
err: fmt.Errorf("Machine deletion in process. Drain successful. %s", machineutils.InitiateVMDeletion),
1209+
retry: machineutils.RetryOp,
1210+
nodeTerminationConditionIsSet: true,
11941211
machine: newMachine(
11951212
&v1alpha1.MachineTemplateSpec{
11961213
ObjectMeta: *newObjectMeta(objMeta, 0),
@@ -1645,7 +1662,7 @@ var _ = Describe("machine", func() {
16451662
),
16461663
},
16471664
}),
1648-
Entry("Drain machine failure", &data{
1665+
Entry("Drain machine failure due to node update failure", &data{
16491666
setup: setup{
16501667
secrets: []*corev1.Secret{
16511668
{
@@ -1688,14 +1705,14 @@ var _ = Describe("machine", func() {
16881705
machineutils.MachinePriority: "3",
16891706
},
16901707
map[string]string{
1691-
"node": "fakeID-0",
1708+
"node": "fakeNode-0",
16921709
},
16931710
true,
16941711
),
16951712
nodes: []*corev1.Node{
16961713
{
16971714
ObjectMeta: metav1.ObjectMeta{
1698-
Name: "fakeID-0",
1715+
Name: "fakeNode-0",
16991716
},
17001717
},
17011718
},
@@ -1715,7 +1732,7 @@ var _ = Describe("machine", func() {
17151732
},
17161733
},
17171734
expect: expect{
1718-
err: fmt.Errorf("Failed to update node"),
1735+
err: fmt.Errorf("failed to create update conditions for node \"fakeNode-0\": Failed to update node"),
17191736
retry: machineutils.RetryOp,
17201737
machine: newMachine(
17211738
&v1alpha1.MachineTemplateSpec{
@@ -1735,7 +1752,7 @@ var _ = Describe("machine", func() {
17351752
LastUpdateTime: metav1.Now(),
17361753
},
17371754
LastOperation: v1alpha1.LastOperation{
1738-
Description: fmt.Sprintf("Drain failed due to - Failed to update node. Will retry in next sync. %s", machineutils.InitiateDrain),
1755+
Description: fmt.Sprintf("Drain failed due to failure in update of node conditions - %s. Will retry in next sync. %s", "failed to create update conditions for node \"fakeNode-0\": Failed to update node", machineutils.InitiateDrain),
17391756
State: v1alpha1.MachineStateFailed,
17401757
Type: v1alpha1.MachineOperationDelete,
17411758
LastUpdateTime: metav1.Now(),
@@ -1873,7 +1890,7 @@ var _ = Describe("machine", func() {
18731890
},
18741891
},
18751892
&v1alpha1.MachineStatus{
1876-
Node: "fakeNode",
1893+
Node: "fakeID",
18771894
CurrentStatus: v1alpha1.CurrentStatus{
18781895
Phase: v1alpha1.MachineTerminating,
18791896
LastUpdateTime: metav1.Now(),

pkg/util/provider/machinecontroller/machine_util.go

Lines changed: 107 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232

3333
machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine"
3434
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
35+
"github.com/gardener/machine-controller-manager/pkg/util/nodeops"
3536
"github.com/gardener/machine-controller-manager/pkg/util/provider/drain"
3637
"github.com/gardener/machine-controller-manager/pkg/util/provider/driver"
3738
"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
@@ -889,50 +890,67 @@ func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest
889890
)
890891
}
891892

892-
buf := bytes.NewBuffer([]byte{})
893-
errBuf := bytes.NewBuffer([]byte{})
894-
895-
drainOptions := drain.NewDrainOptions(
896-
c.targetCoreClient,
897-
timeOutDuration,
898-
maxEvictRetries,
899-
pvDetachTimeOut,
900-
nodeName,
901-
-1,
902-
forceDeletePods,
903-
true,
904-
true,
905-
true,
906-
buf,
907-
errBuf,
908-
c.driver,
909-
c.pvcLister,
910-
c.pvLister,
911-
)
912-
err = drainOptions.RunDrain()
913-
if err == nil {
914-
// Drain successful
915-
klog.V(2).Infof("Drain successful for machine %q. \nBuf:%v \nErrBuf:%v", machine.Name, buf, errBuf)
893+
// update node with the machine's state prior to termination
894+
if err = c.UpdateNodeTerminationCondition(machine); err != nil {
895+
if forceDeleteMachine {
896+
klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err)
897+
} else {
898+
klog.Errorf("Drain failed due to failure in update of node conditions: %v", err)
916899

917-
description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion)
918-
state = v1alpha1.MachineStateProcessing
919-
phase = v1alpha1.MachineTerminating
900+
description = fmt.Sprintf("Drain failed due to failure in update of node conditions - %s. Will retry in next sync. %s", err.Error(), machineutils.InitiateDrain)
901+
state = v1alpha1.MachineStateFailed
902+
phase = v1alpha1.MachineTerminating
920903

921-
// Return error even when machine object is updated
922-
err = fmt.Errorf("Machine deletion in process. " + description)
923-
} else if err != nil && forceDeleteMachine {
924-
// Drain failed on force deletion
925-
klog.Warningf("Drain failed for machine %q. However, since it's a force deletion shall continue deletion of VM. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err)
904+
skipDrain = true
905+
}
906+
}
926907

927-
description = fmt.Sprintf("Drain failed due to - %s. However, since it's a force deletion shall continue deletion of VM. %s", err.Error(), machineutils.InitiateVMDeletion)
928-
state = v1alpha1.MachineStateProcessing
929-
phase = v1alpha1.MachineTerminating
930-
} else {
931-
klog.Warningf("Drain failed for machine %q. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err)
908+
if !skipDrain {
909+
buf := bytes.NewBuffer([]byte{})
910+
errBuf := bytes.NewBuffer([]byte{})
911+
912+
drainOptions := drain.NewDrainOptions(
913+
c.targetCoreClient,
914+
timeOutDuration,
915+
maxEvictRetries,
916+
pvDetachTimeOut,
917+
nodeName,
918+
-1,
919+
forceDeletePods,
920+
true,
921+
true,
922+
true,
923+
buf,
924+
errBuf,
925+
c.driver,
926+
c.pvcLister,
927+
c.pvLister,
928+
)
929+
err = drainOptions.RunDrain()
930+
if err == nil {
931+
// Drain successful
932+
klog.V(2).Infof("Drain successful for machine %q. \nBuf:%v \nErrBuf:%v", machine.Name, buf, errBuf)
932933

933-
description = fmt.Sprintf("Drain failed due to - %s. Will retry in next sync. %s", err.Error(), machineutils.InitiateDrain)
934-
state = v1alpha1.MachineStateFailed
935-
phase = v1alpha1.MachineTerminating
934+
description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion)
935+
state = v1alpha1.MachineStateProcessing
936+
phase = v1alpha1.MachineTerminating
937+
938+
// Return error even when machine object is updated
939+
err = fmt.Errorf("Machine deletion in process. " + description)
940+
} else if err != nil && forceDeleteMachine {
941+
// Drain failed on force deletion
942+
klog.Warningf("Drain failed for machine %q. However, since it's a force deletion shall continue deletion of VM. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err)
943+
944+
description = fmt.Sprintf("Drain failed due to - %s. However, since it's a force deletion shall continue deletion of VM. %s", err.Error(), machineutils.InitiateVMDeletion)
945+
state = v1alpha1.MachineStateProcessing
946+
phase = v1alpha1.MachineTerminating
947+
} else {
948+
klog.Warningf("Drain failed for machine %q. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err)
949+
950+
description = fmt.Sprintf("Drain failed due to - %s. Will retry in next sync. %s", err.Error(), machineutils.InitiateDrain)
951+
state = v1alpha1.MachineStateFailed
952+
phase = v1alpha1.MachineTerminating
953+
}
936954
}
937955
}
938956

@@ -1139,3 +1157,52 @@ func (c *controller) getEffectiveNodeConditions(machine *v1alpha1.Machine) *stri
11391157
}
11401158
return effectiveNodeConditions
11411159
}
1160+
1161+
// UpdateNodeTerminationCondition updates termination condition on the node object
1162+
func (c *controller) UpdateNodeTerminationCondition(machine *v1alpha1.Machine) error {
1163+
if machine.Status.CurrentStatus.Phase == "" {
1164+
return nil
1165+
}
1166+
1167+
nodeName := machine.Labels["node"]
1168+
1169+
terminationCondition := v1.NodeCondition{
1170+
Type: machineutils.NodeTerminationCondition,
1171+
Status: v1.ConditionTrue,
1172+
LastHeartbeatTime: metav1.Now(),
1173+
LastTransitionTime: metav1.Now(),
1174+
}
1175+
1176+
// check if condition already exists
1177+
cond, err := nodeops.GetNodeCondition(c.targetCoreClient, nodeName, machineutils.NodeTerminationCondition)
1178+
if err != nil {
1179+
if apierrors.IsNotFound(err) {
1180+
return nil
1181+
}
1182+
return err
1183+
}
1184+
1185+
if cond != nil && machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating {
1186+
// do not consider machine terminating phase if node already terminating
1187+
terminationCondition.Reason = cond.Reason
1188+
terminationCondition.Message = cond.Message
1189+
} else {
1190+
setTerminationReasonByPhase(machine.Status.CurrentStatus.Phase, &terminationCondition)
1191+
}
1192+
1193+
err = nodeops.AddOrUpdateConditionsOnNode(c.targetCoreClient, nodeName, terminationCondition)
1194+
if apierrors.IsNotFound(err) {
1195+
return nil
1196+
}
1197+
return err
1198+
}
1199+
1200+
func setTerminationReasonByPhase(phase v1alpha1.MachinePhase, terminationCondition *v1.NodeCondition) {
1201+
if phase == v1alpha1.MachineFailed { // if failed, terminated due to health
1202+
terminationCondition.Reason = machineutils.NodeUnhealthy
1203+
terminationCondition.Message = "Machine Controller is terminating failed machine"
1204+
} else { // in all other cases (except for already terminating): assume scale down
1205+
terminationCondition.Reason = machineutils.NodeScaledDown
1206+
terminationCondition.Message = "Machine Controller is scaling down machine"
1207+
}
1208+
}

pkg/util/provider/machineutils/utils.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
// Package machineutils contains the consts and global vaariables for machine operation
1818
package machineutils
1919

20+
import v1 "k8s.io/api/core/v1"
21+
2022
const (
2123
// GetVMStatus sets machine status to terminating and specifies next step as getting VMs
2224
GetVMStatus = "Set machine status to termination. Now, getting VM Status"
@@ -40,6 +42,13 @@ const (
4042
MachineClassKind = "MachineClass"
4143
// MigratedMachineClass annotation helps in identifying machineClasses who have been migrated by migration controller
4244
MigratedMachineClass = "machine.sapcloud.io/migrated"
45+
46+
// NodeUnhealthy is a node termination reason for failed machines
47+
NodeUnhealthy = "Unhealthy"
48+
// NodeScaledDown is a node termination reason for healthy deleted machines
49+
NodeScaledDown = "ScaleDown"
50+
// NodeTerminationCondition describes nodes that are terminating
51+
NodeTerminationCondition v1.NodeConditionType = "Terminating"
4352
)
4453

4554
// Retry is a label for retrying operation

0 commit comments

Comments
 (0)