Skip to content

Commit d8c0929

Browse files
authored
Check cloud-init status as part of machine provisioning (#166)
* Check cloud-init status and throw error state if cloud-init fails
1 parent f26d055 commit d8c0929

File tree

7 files changed

+268
-0
lines changed

7 files changed

+268
-0
lines changed

internal/service/vmservice/vm.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/ionos-cloud/cluster-api-provider-proxmox/internal/service/scheduler"
3434
"github.com/ionos-cloud/cluster-api-provider-proxmox/internal/service/taskservice"
3535
"github.com/ionos-cloud/cluster-api-provider-proxmox/pkg/proxmox"
36+
"github.com/ionos-cloud/cluster-api-provider-proxmox/pkg/proxmox/goproxmox"
3637
"github.com/ionos-cloud/cluster-api-provider-proxmox/pkg/scope"
3738
)
3839

@@ -91,6 +92,10 @@ func ReconcileVM(ctx context.Context, scope *scope.MachineScope) (infrav1alpha1.
9192
return vm, err
9293
}
9394

95+
if requeue, err := checkCloudInitStatus(ctx, scope); err != nil || requeue {
96+
return vm, err
97+
}
98+
9499
// if the root machine is ready, we can assume that the VM is ready as well.
95100
// unmount the cloud-init iso if it is still mounted.
96101
if scope.Machine.Status.BootstrapReady && scope.Machine.Status.NodeRef != nil {
@@ -103,6 +108,27 @@ func ReconcileVM(ctx context.Context, scope *scope.MachineScope) (infrav1alpha1.
103108
return vm, nil
104109
}
105110

111+
func checkCloudInitStatus(ctx context.Context, machineScope *scope.MachineScope) (requeue bool, err error) {
112+
if !machineScope.VirtualMachine.IsRunning() {
113+
// skip if the vm is not running.
114+
return true, nil
115+
}
116+
117+
if running, err := machineScope.InfraCluster.ProxmoxClient.CloudInitStatus(ctx, machineScope.VirtualMachine); err != nil || running {
118+
if running {
119+
return true, nil
120+
}
121+
if errors.Is(goproxmox.ErrCloudInitFailed, err) {
122+
conditions.MarkFalse(machineScope.ProxmoxMachine, infrav1alpha1.VMProvisionedCondition, infrav1alpha1.VMProvisionFailedReason, clusterv1.ConditionSeverityError, err.Error())
123+
machineScope.SetFailureMessage(err)
124+
machineScope.SetFailureReason(capierrors.MachineStatusError("BootstrapFailed"))
125+
}
126+
return false, err
127+
}
128+
129+
return false, nil
130+
}
131+
106132
// ensureVirtualMachine creates a Proxmox VM if it doesn't exist and updates the given MachineScope.
107133
func ensureVirtualMachine(ctx context.Context, machineScope *scope.MachineScope) (requeue bool, err error) {
108134
// if there's an associated task, requeue.

internal/service/vmservice/vm_test.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@ import (
2323

2424
"github.com/stretchr/testify/require"
2525
"k8s.io/utils/ptr"
26+
capierrors "sigs.k8s.io/cluster-api/errors"
2627

2728
infrav1alpha1 "github.com/ionos-cloud/cluster-api-provider-proxmox/api/v1alpha1"
2829
"github.com/ionos-cloud/cluster-api-provider-proxmox/internal/service/scheduler"
2930
"github.com/ionos-cloud/cluster-api-provider-proxmox/pkg/proxmox"
31+
"github.com/ionos-cloud/cluster-api-provider-proxmox/pkg/proxmox/goproxmox"
3032
"github.com/ionos-cloud/cluster-api-provider-proxmox/pkg/scope"
3133
)
3234

@@ -36,8 +38,10 @@ func TestReconcileVM_EverythingReady(t *testing.T) {
3638
machineScope.SetVirtualMachineID(int64(vm.VMID))
3739
machineScope.ProxmoxMachine.Status.IPAddresses = map[string]infrav1alpha1.IPAddress{infrav1alpha1.DefaultNetworkDevice: {IPV4: "10.10.10.10"}}
3840
machineScope.ProxmoxMachine.Status.BootstrapDataProvided = ptr.To(true)
41+
machineScope.ProxmoxMachine.Status.Ready = true
3942

4043
proxmoxClient.EXPECT().GetVM(context.Background(), "node1", int64(123)).Return(vm, nil).Once()
44+
proxmoxClient.EXPECT().CloudInitStatus(context.Background(), vm).Return(false, nil).Once()
4145

4246
result, err := ReconcileVM(context.Background(), machineScope)
4347
require.NoError(t, err)
@@ -308,3 +312,36 @@ func TestReconcileDisks_UnmountCloudInitISO(t *testing.T) {
308312

309313
require.NoError(t, unmountCloudInitISO(context.Background(), machineScope))
310314
}
315+
316+
func TestReconcileVM_CloudInitFailed(t *testing.T) {
317+
machineScope, proxmoxClient, _ := setupReconcilerTest(t)
318+
vm := newRunningVM()
319+
machineScope.SetVirtualMachineID(int64(vm.VMID))
320+
machineScope.ProxmoxMachine.Status.IPAddresses = map[string]infrav1alpha1.IPAddress{infrav1alpha1.DefaultNetworkDevice: {IPV4: "10.10.10.10"}}
321+
machineScope.ProxmoxMachine.Status.BootstrapDataProvided = ptr.To(true)
322+
machineScope.ProxmoxMachine.Status.Ready = true
323+
324+
proxmoxClient.EXPECT().GetVM(context.Background(), "node1", int64(123)).Return(vm, nil).Once()
325+
proxmoxClient.EXPECT().CloudInitStatus(context.Background(), vm).Return(false, goproxmox.ErrCloudInitFailed).Once()
326+
327+
_, err := ReconcileVM(context.Background(), machineScope)
328+
require.Error(t, err, "unknown error")
329+
require.Equal(t, machineScope.ProxmoxMachine.Status.FailureReason, ptr.To(capierrors.MachineStatusError("BootstrapFailed")))
330+
require.Equal(t, machineScope.ProxmoxMachine.Status.FailureMessage, ptr.To("cloud-init failed execution"))
331+
}
332+
333+
func TestReconcileVM_CloudInitRunning(t *testing.T) {
334+
machineScope, proxmoxClient, _ := setupReconcilerTest(t)
335+
vm := newRunningVM()
336+
machineScope.SetVirtualMachineID(int64(vm.VMID))
337+
machineScope.ProxmoxMachine.Status.IPAddresses = map[string]infrav1alpha1.IPAddress{infrav1alpha1.DefaultNetworkDevice: {IPV4: "10.10.10.10"}}
338+
machineScope.ProxmoxMachine.Status.BootstrapDataProvided = ptr.To(true)
339+
machineScope.ProxmoxMachine.Status.Ready = true
340+
341+
proxmoxClient.EXPECT().GetVM(context.Background(), "node1", int64(123)).Return(vm, nil).Once()
342+
proxmoxClient.EXPECT().CloudInitStatus(context.Background(), vm).Return(true, nil).Once()
343+
344+
result, err := ReconcileVM(context.Background(), machineScope)
345+
require.NoError(t, err)
346+
require.Equal(t, infrav1alpha1.VirtualMachineStatePending, result.State)
347+
}

pkg/proxmox/client.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,6 @@ type Client interface {
4848
TagVM(ctx context.Context, vm *proxmox.VirtualMachine, tag string) (*proxmox.Task, error)
4949

5050
UnmountCloudInitISO(ctx context.Context, vm *proxmox.VirtualMachine, device string) error
51+
52+
CloudInitStatus(ctx context.Context, vm *proxmox.VirtualMachine) (bool, error)
5153
}

pkg/proxmox/goproxmox/api_client.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,11 @@ import (
2121
"context"
2222
"fmt"
2323
"net/url"
24+
"strings"
2425

2526
"github.com/go-logr/logr"
2627
"github.com/luthermonson/go-proxmox"
28+
"github.com/pkg/errors"
2729

2830
capmox "github.com/ionos-cloud/cluster-api-provider-proxmox/pkg/proxmox"
2931
)
@@ -258,3 +260,29 @@ func (c *APIClient) UnmountCloudInitISO(ctx context.Context, vm *proxmox.Virtual
258260
}
259261
return err
260262
}
263+
264+
// CloudInitStatus returns the cloud-init status of the VM.
265+
func (c *APIClient) CloudInitStatus(ctx context.Context, vm *proxmox.VirtualMachine) (running bool, err error) {
266+
if err := vm.WaitForAgent(ctx, 5); err != nil {
267+
return false, errors.Wrap(err, "error waiting for agent")
268+
}
269+
270+
pid, err := vm.AgentExec(ctx, []string{"cloud-init", "status"}, "")
271+
if err != nil {
272+
return false, errors.Wrap(err, "unable to get cloud-init status")
273+
}
274+
275+
status, err := vm.WaitForAgentExecExit(ctx, pid, 2)
276+
if err != nil {
277+
return false, errors.Wrap(err, "unable to wait for agent exec")
278+
}
279+
280+
if status.Exited == 1 && status.ExitCode == 0 && strings.Contains(status.OutData, "running") {
281+
return true, nil
282+
}
283+
if status.Exited == 1 && status.ExitCode != 0 {
284+
return false, ErrCloudInitFailed
285+
}
286+
287+
return false, nil
288+
}

pkg/proxmox/goproxmox/api_client_test.go

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package goproxmox
1818

1919
import (
2020
"context"
21+
"fmt"
2122
"net/http"
2223
"testing"
2324

@@ -150,3 +151,115 @@ func TestProxmoxAPIClient_GetReservableMemoryBytes(t *testing.T) {
150151
})
151152
}
152153
}
154+
155+
func TestProxmoxAPIClient_CloudInitStatus(t *testing.T) {
156+
tests := []struct {
157+
name string
158+
node string // node name
159+
vmid int64 // vmid
160+
pid float64 // pid of agent
161+
exited int // exited state
162+
exitcode int // exitcode
163+
outData string // out-data
164+
running bool // expected running state
165+
err error // expected error
166+
}{
167+
{
168+
name: "cloud-init success",
169+
node: "pve",
170+
vmid: 1111,
171+
pid: 12234,
172+
exited: 1,
173+
exitcode: 0,
174+
outData: "status: done\n",
175+
running: false,
176+
err: nil,
177+
},
178+
{
179+
name: "cloud-init running",
180+
node: "pve",
181+
vmid: 1111,
182+
pid: 12234,
183+
exited: 1,
184+
exitcode: 0,
185+
outData: "status: running\n",
186+
running: true,
187+
err: nil,
188+
},
189+
{
190+
name: "cloud-init failed",
191+
node: "pve",
192+
vmid: 1111,
193+
pid: 12234,
194+
exited: 1,
195+
exitcode: 1,
196+
outData: "status: error\n",
197+
running: false,
198+
err: ErrCloudInitFailed,
199+
},
200+
}
201+
202+
for _, test := range tests {
203+
t.Run(test.name, func(t *testing.T) {
204+
client := newTestClient(t)
205+
206+
httpmock.RegisterResponder(http.MethodGet, fmt.Sprintf(`=~/nodes/%s/status`, test.node),
207+
newJSONResponder(200, proxmox.Node{Name: "pve"}))
208+
209+
httpmock.RegisterResponder(http.MethodGet, fmt.Sprintf(`=~/nodes/%s/qemu/%d/status/current`, test.node, test.vmid),
210+
newJSONResponder(200, proxmox.VirtualMachine{
211+
VMID: proxmox.StringOrUint64(test.vmid),
212+
Name: "legit-worker",
213+
Node: test.node,
214+
}))
215+
216+
httpmock.RegisterResponder(http.MethodGet, fmt.Sprintf(`=~/nodes/%s/qemu/%d/config`, test.node, test.vmid),
217+
newJSONResponder(200, proxmox.VirtualMachineConfig{
218+
Name: "legit-worker",
219+
}))
220+
221+
vm, err := client.GetVM(context.Background(), test.node, test.vmid)
222+
require.NoError(t, err)
223+
require.NotNil(t, vm)
224+
225+
// WaitForAgent mock
226+
httpmock.RegisterResponder(http.MethodGet, fmt.Sprintf(`=~/nodes/%s/qemu/%d/agent/get-osinfo`, vm.Node, vm.VMID),
227+
newJSONResponder(200,
228+
map[string]*proxmox.AgentOsInfo{
229+
"result": {
230+
ID: "ubuntu",
231+
VersionID: "22.04",
232+
Machine: "x86_64",
233+
KernelRelease: "5.15.0-89-generic",
234+
KernelVersion: "#99-Ubuntu SMP Mon Oct 30 20:42:41 UTC 2023",
235+
Name: "Ubuntu",
236+
Version: "22.04.3 LTS (Jammy Jellyfish)",
237+
PrettyName: "Ubuntu 22.04.3 LTS",
238+
},
239+
},
240+
))
241+
242+
// AgentExec mock
243+
httpmock.RegisterResponder(http.MethodPost, fmt.Sprintf(`=~/nodes/%s/qemu/%d/agent/exec\z`, vm.Node, vm.VMID),
244+
newJSONResponder(200,
245+
map[string]interface{}{
246+
"pid": test.pid,
247+
},
248+
))
249+
250+
// AgentExecStatus mock
251+
httpmock.RegisterResponder(http.MethodGet, fmt.Sprintf(`=~/nodes/%s/qemu/%d/agent/exec-status\?pid=%v`, vm.Node, vm.VMID, test.pid),
252+
newJSONResponder(200,
253+
&proxmox.AgentExecStatus{
254+
Exited: test.exited,
255+
ExitCode: test.exitcode,
256+
OutData: test.outData,
257+
},
258+
))
259+
260+
running, err := client.CloudInitStatus(context.Background(), vm)
261+
require.Equal(t, err, test.err)
262+
require.Equal(t, test.running, running)
263+
})
264+
}
265+
}

pkg/proxmox/goproxmox/errors.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
package goproxmox
2+
3+
import "github.com/pkg/errors"
4+
5+
// ErrCloudInitFailed is the sentinel error reported when cloud-init failed execution.
var ErrCloudInitFailed = errors.New("cloud-init failed execution")

pkg/proxmox/proxmoxtest/mock_client.go

Lines changed: 54 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)