Skip to content

Commit 005a42b

Browse files
committed
feat(cluster-autoscaler): improve nodes listing in ClusterAPI provider
Add improved error handling for machines phase in the ClusterAPI node group implementation. When a machine is in Deleting/Failed/Pending phase, mark the cloudprovider.Instance with a status for cluster-autoscaler recovery actions. The changes: - Enhance Nodes listing to allow reporting the machine phase in Instance status - Add error status reporting for failed machines This change helps identify and manage failed machines more effectively, allowing the autoscaler to make better scaling decisions.
1 parent 8251159 commit 005a42b

File tree

2 files changed

+295
-2
lines changed

2 files changed

+295
-2
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,9 +253,82 @@ func (ng *nodegroup) Nodes() ([]cloudprovider.Instance, error) {
253253
// must match the ID on the Node object itself.
254254
// https://github.com/kubernetes/autoscaler/blob/a973259f1852303ba38a3a61eeee8489cf4e1b13/cluster-autoscaler/clusterstate/clusterstate.go#L967-L985
255255
instances := make([]cloudprovider.Instance, len(providerIDs))
256-
for i := range providerIDs {
256+
for i, providerID := range providerIDs {
257+
providerIDNormalized := normalizedProviderID(providerID)
258+
259+
// Add instance Status to report instance state to cluster-autoscaler.
260+
// This helps cluster-autoscaler make better scaling decisions.
261+
//
262+
// Machine can be Failed for a variety of reasons, here we are looking for a specific errors:
263+
// - Failed to provision
264+
// - Failed to delete.
265+
// Other reasons for a machine to be in a Failed state are not forwarding Status to the autoscaler.
266+
var status *cloudprovider.InstanceStatus
267+
switch {
268+
case isFailedMachineProviderID(providerIDNormalized):
269+
klog.V(4).Infof("Machine failed in node group %s (%s)", ng.Id(), providerID)
270+
271+
machine, err := ng.machineController.findMachineByProviderID(providerIDNormalized)
272+
if err != nil {
273+
return nil, err
274+
}
275+
276+
if machine != nil {
277+
if !machine.GetDeletionTimestamp().IsZero() {
278+
klog.V(4).Infof("Machine failed in node group %s (%s) is being deleted", ng.Id(), providerID)
279+
status = &cloudprovider.InstanceStatus{
280+
State: cloudprovider.InstanceDeleting,
281+
ErrorInfo: &cloudprovider.InstanceErrorInfo{
282+
ErrorClass: cloudprovider.OtherErrorClass,
283+
ErrorCode: "DeletingFailed",
284+
ErrorMessage: "Machine deletion failed",
285+
},
286+
}
287+
} else {
288+
_, nodeFound, err := unstructured.NestedFieldCopy(machine.UnstructuredContent(), "status", "nodeRef")
289+
if err != nil {
290+
return nil, err
291+
}
292+
293+
// Machine failed without a nodeRef, this indicates that the machine failed to provision.
294+
// This is a special case where the machine is in a Failed state and the node is not created.
295+
// Machine controller will not reconcile this machine, so we need to report this to the autoscaler.
296+
if !nodeFound {
297+
klog.V(4).Infof("Machine failed in node group %s (%s) was being created", ng.Id(), providerID)
298+
status = &cloudprovider.InstanceStatus{
299+
State: cloudprovider.InstanceCreating,
300+
ErrorInfo: &cloudprovider.InstanceErrorInfo{
301+
ErrorClass: cloudprovider.OtherErrorClass,
302+
ErrorCode: "ProvisioningFailed",
303+
ErrorMessage: "Machine provisioning failed",
304+
},
305+
}
306+
}
307+
}
308+
}
309+
310+
case isPendingMachineProviderID(providerIDNormalized):
311+
klog.V(4).Infof("Machine pending in node group %s (%s)", ng.Id(), providerID)
312+
status = &cloudprovider.InstanceStatus{
313+
State: cloudprovider.InstanceCreating,
314+
}
315+
316+
case isDeletingMachineProviderID(providerIDNormalized):
317+
klog.V(4).Infof("Machine deleting in node group %s (%s)", ng.Id(), providerID)
318+
status = &cloudprovider.InstanceStatus{
319+
State: cloudprovider.InstanceDeleting,
320+
}
321+
322+
default:
323+
klog.V(4).Infof("Machine running in node group %s (%s)", ng.Id(), providerID)
324+
status = &cloudprovider.InstanceStatus{
325+
State: cloudprovider.InstanceRunning,
326+
}
327+
}
328+
257329
instances[i] = cloudprovider.Instance{
258-
Id: providerIDs[i],
330+
Id: providerID,
331+
Status: status,
259332
}
260333
}
261334

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1736,3 +1736,223 @@ func TestNodeGroupGetOptions(t *testing.T) {
17361736
})
17371737
}
17381738
}
1739+
1740+
func TestNodeGroupNodesInstancesStatus(t *testing.T) {
1741+
type testCase struct {
1742+
description string
1743+
nodeCount int
1744+
includePendingMachine bool
1745+
includeDeletingMachine bool
1746+
includeFailedMachineWithNodeRef bool
1747+
includeFailedMachineWithoutNodeRef bool
1748+
includeFailedMachineDeleting bool
1749+
}
1750+
1751+
testCases := []testCase{
1752+
{
1753+
description: "standard number of nodes",
1754+
nodeCount: 5,
1755+
},
1756+
{
1757+
description: "includes a machine in pending state",
1758+
nodeCount: 5,
1759+
includePendingMachine: true,
1760+
},
1761+
{
1762+
description: "includes a machine in deleting state",
1763+
nodeCount: 5,
1764+
includeDeletingMachine: true,
1765+
},
1766+
{
1767+
description: "includes a machine in failed state with nodeRef",
1768+
nodeCount: 5,
1769+
includeFailedMachineWithNodeRef: true,
1770+
},
1771+
{
1772+
description: "includes a machine in failed state without nodeRef",
1773+
nodeCount: 5,
1774+
includeFailedMachineWithoutNodeRef: true,
1775+
},
1776+
}
1777+
1778+
test := func(t *testing.T, tc *testCase, testConfig *testConfig) {
1779+
controller, stop := mustCreateTestController(t, testConfig)
1780+
defer stop()
1781+
1782+
if tc.includePendingMachine {
1783+
if tc.nodeCount < 1 {
1784+
t.Fatal("test cannot pass, deleted machine requires at least 1 machine in machineset")
1785+
}
1786+
1787+
machine := testConfig.machines[0].DeepCopy()
1788+
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
1789+
unstructured.RemoveNestedField(machine.Object, "status", "nodeRef")
1790+
1791+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1792+
t.Fatalf("unexpected error updating machine, got %v", err)
1793+
}
1794+
}
1795+
1796+
if tc.includeDeletingMachine {
1797+
if tc.nodeCount < 2 {
1798+
t.Fatal("test cannot pass, deleted machine requires at least 2 machine in machineset")
1799+
}
1800+
1801+
machine := testConfig.machines[1].DeepCopy()
1802+
timestamp := metav1.Now()
1803+
machine.SetDeletionTimestamp(&timestamp)
1804+
1805+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1806+
t.Fatalf("unexpected error updating machine, got %v", err)
1807+
}
1808+
}
1809+
1810+
if tc.includeFailedMachineWithNodeRef {
1811+
if tc.nodeCount < 3 {
1812+
t.Fatal("test cannot pass, deleted machine requires at least 3 machine in machineset")
1813+
}
1814+
1815+
machine := testConfig.machines[2].DeepCopy()
1816+
unstructured.SetNestedField(machine.Object, "node-1", "status", "nodeRef", "name")
1817+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
1818+
1819+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1820+
t.Fatalf("unexpected error updating machine, got %v", err)
1821+
}
1822+
}
1823+
1824+
if tc.includeFailedMachineWithoutNodeRef {
1825+
if tc.nodeCount < 4 {
1826+
t.Fatal("test cannot pass, deleted machine requires at least 4 machine in machineset")
1827+
}
1828+
1829+
machine := testConfig.machines[3].DeepCopy()
1830+
unstructured.RemoveNestedField(machine.Object, "status", "nodeRef")
1831+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
1832+
1833+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1834+
t.Fatalf("unexpected error updating machine, got %v", err)
1835+
}
1836+
}
1837+
1838+
if tc.includeFailedMachineDeleting {
1839+
if tc.nodeCount < 5 {
1840+
t.Fatal("test cannot pass, deleted machine requires at least 5 machine in machineset")
1841+
}
1842+
1843+
machine := testConfig.machines[4].DeepCopy()
1844+
timestamp := metav1.Now()
1845+
machine.SetDeletionTimestamp(&timestamp)
1846+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
1847+
1848+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1849+
t.Fatalf("unexpected error updating machine, got %v", err)
1850+
}
1851+
}
1852+
1853+
nodegroups, err := controller.nodeGroups()
1854+
if err != nil {
1855+
t.Fatalf("unexpected error: %v", err)
1856+
}
1857+
1858+
if l := len(nodegroups); l != 1 {
1859+
t.Fatalf("expected 1 nodegroup, got %d", l)
1860+
}
1861+
1862+
ng := nodegroups[0]
1863+
instances, err := ng.Nodes()
1864+
if err != nil {
1865+
t.Fatalf("unexpected error: %v", err)
1866+
}
1867+
1868+
expectedCount := tc.nodeCount
1869+
if len(instances) != expectedCount {
1870+
t.Errorf("expected %d nodes, got %d", expectedCount, len(instances))
1871+
}
1872+
1873+
// Sort instances by Id for stable comparison
1874+
sort.Slice(instances, func(i, j int) bool {
1875+
return instances[i].Id < instances[j].Id
1876+
})
1877+
1878+
for _, instance := range instances {
1879+
t.Logf("instance: %v", instance)
1880+
if tc.includePendingMachine && strings.HasPrefix(instance.Id, pendingMachinePrefix) {
1881+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceCreating {
1882+
t.Errorf("expected pending machine to have status %v, got %v", cloudprovider.InstanceCreating, instance.Status)
1883+
}
1884+
} else if tc.includeDeletingMachine && strings.HasPrefix(instance.Id, deletingMachinePrefix) {
1885+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceDeleting {
1886+
t.Errorf("expected deleting machine to have status %v, got %v", cloudprovider.InstanceDeleting, instance.Status)
1887+
}
1888+
} else if tc.includeFailedMachineWithNodeRef && strings.HasPrefix(instance.Id, failedMachinePrefix) {
1889+
if instance.Status != nil {
1890+
t.Errorf("expected failed machine with nodeRef to not have status, got %v", instance.Status)
1891+
}
1892+
} else if tc.includeFailedMachineWithoutNodeRef && strings.HasPrefix(instance.Id, failedMachinePrefix) {
1893+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceCreating {
1894+
t.Errorf("expected failed machine without nodeRef to have status %v, got %v", cloudprovider.InstanceCreating, instance.Status)
1895+
}
1896+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorClass != cloudprovider.OtherErrorClass {
1897+
t.Errorf("expected failed machine without nodeRef to have error class %v, got %v", cloudprovider.OtherErrorClass, instance.Status.ErrorInfo.ErrorClass)
1898+
}
1899+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorCode != "ProvisioningFailed" {
1900+
t.Errorf("expected failed machine without nodeRef to have error code %v, got %v", "ProvisioningFailed", instance.Status.ErrorInfo.ErrorCode)
1901+
}
1902+
} else if tc.includeFailedMachineDeleting && strings.HasPrefix(instance.Id, failedMachinePrefix) {
1903+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceDeleting {
1904+
t.Errorf("expected failed machine deleting to have status %v, got %v", cloudprovider.InstanceDeleting, instance.Status)
1905+
}
1906+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorClass != cloudprovider.OtherErrorClass {
1907+
t.Errorf("expected failed machine deleting to have error class %v, got %v", cloudprovider.OtherErrorClass, instance.Status.ErrorInfo.ErrorClass)
1908+
}
1909+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorCode != "DeletingFailed" {
1910+
t.Errorf("expected failed machine deleting to have error code %v, got %v", "DeletingFailed", instance.Status.ErrorInfo.ErrorCode)
1911+
}
1912+
}
1913+
}
1914+
}
1915+
1916+
annotations := map[string]string{
1917+
nodeGroupMinSizeAnnotationKey: "1",
1918+
nodeGroupMaxSizeAnnotationKey: "10",
1919+
}
1920+
1921+
t.Run("MachineSet", func(t *testing.T) {
1922+
for _, tc := range testCases {
1923+
t.Run(tc.description, func(t *testing.T) {
1924+
test(
1925+
t,
1926+
&tc,
1927+
createMachineSetTestConfig(
1928+
RandomString(6),
1929+
RandomString(6),
1930+
RandomString(6),
1931+
tc.nodeCount,
1932+
annotations,
1933+
nil,
1934+
),
1935+
)
1936+
})
1937+
}
1938+
})
1939+
1940+
t.Run("MachineDeployment", func(t *testing.T) {
1941+
for _, tc := range testCases {
1942+
t.Run(tc.description, func(t *testing.T) {
1943+
test(
1944+
t,
1945+
&tc,
1946+
createMachineDeploymentTestConfig(
1947+
RandomString(6),
1948+
RandomString(6),
1949+
RandomString(6),
1950+
tc.nodeCount,
1951+
annotations,
1952+
nil,
1953+
),
1954+
)
1955+
})
1956+
}
1957+
})
1958+
}

0 commit comments

Comments
 (0)