Skip to content

Commit a53c6da

Browse files
🌱 Refactor Hcloud Provisioning, use Status.BootState (#1650)
Co-authored-by: Dhairya Arora <[email protected]>
1 parent dbf908c commit a53c6da

17 files changed

+944
-296
lines changed

api/v1beta1/hcloudmachine_types.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,19 @@ type HCloudMachineStatus struct {
9696
// Conditions define the current service state of the HCloudMachine.
9797
// +optional
9898
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
99+
100+
// BootState indicates the current state during provisioning.
101+
//
102+
// The states will be:
103+
// "" -> BootToRealOS -> OperatingSystemRunning
104+
//
105+
// +optional
106+
BootState HCloudBootState `json:"bootState"`
107+
108+
// BootStateSince is the timestamp of the last change to BootState. It is used to timeout
109+
// provisioning if a state takes too long.
110+
// +optional
111+
BootStateSince metav1.Time `json:"bootStateSince,omitzero"`
99112
}
100113

101114
// HCloudMachine is the Schema for the hcloudmachines API.
@@ -128,6 +141,15 @@ func (r *HCloudMachine) SetConditions(conditions clusterv1.Conditions) {
128141
r.Status.Conditions = conditions
129142
}
130143

144+
// SetBootState sets Status.BootStates and updates Status.BootStateSince.
145+
func (r *HCloudMachine) SetBootState(bootState HCloudBootState) {
146+
if r.Status.BootState == bootState {
147+
return
148+
}
149+
r.Status.BootState = bootState
150+
r.Status.BootStateSince = metav1.Now()
151+
}
152+
131153
//+kubebuilder:object:root=true
132154

133155
// HCloudMachineList contains a list of HCloudMachine.

api/v1beta1/hcloudmachine_validation.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import (
2222
"k8s.io/apimachinery/pkg/util/validation/field"
2323
)
2424

25-
func validateHCloudMachineSpec(oldSpec, newSpec HCloudMachineSpec) field.ErrorList {
25+
func validateHCloudMachineSpecUpdate(oldSpec, newSpec HCloudMachineSpec) field.ErrorList {
2626
var allErrs field.ErrorList
2727
// Type is immutable
2828
if !reflect.DeepEqual(oldSpec.Type, newSpec.Type) {

api/v1beta1/hcloudmachine_validation_test.go

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ type args struct {
2828
newSpec HCloudMachineSpec
2929
}
3030

31-
func TestValidateHCloudMachineSpec(t *testing.T) {
31+
func TestValidateHCloudMachineSpecUpdate(t *testing.T) {
3232
tests := []struct {
3333
name string
3434
args args
@@ -38,10 +38,12 @@ func TestValidateHCloudMachineSpec(t *testing.T) {
3838
name: "Immutable Type",
3939
args: args{
4040
oldSpec: HCloudMachineSpec{
41-
Type: "cpx11",
41+
ImageName: "ubuntu-24.04",
42+
Type: "cpx11",
4243
},
4344
newSpec: HCloudMachineSpec{
44-
Type: "cx21",
45+
ImageName: "ubuntu-24.04",
46+
Type: "cx21",
4547
},
4648
},
4749
want: field.Invalid(field.NewPath("spec", "type"), "cx21", "field is immutable"),
@@ -62,6 +64,7 @@ func TestValidateHCloudMachineSpec(t *testing.T) {
6264
name: "Immutable SSHKeys",
6365
args: args{
6466
oldSpec: HCloudMachineSpec{
67+
ImageName: "ubuntu-24.04",
6568
SSHKeys: []SSHKey{
6669
{
6770
Name: "ssh-key-1",
@@ -70,6 +73,7 @@ func TestValidateHCloudMachineSpec(t *testing.T) {
7073
},
7174
},
7275
newSpec: HCloudMachineSpec{
76+
ImageName: "ubuntu-24.04",
7377
SSHKeys: []SSHKey{
7478
{
7579
Name: "ssh-key-1",
@@ -97,9 +101,11 @@ func TestValidateHCloudMachineSpec(t *testing.T) {
97101
name: "Immutable PlacementGroupName",
98102
args: args{
99103
oldSpec: HCloudMachineSpec{
104+
ImageName: "ubuntu-24.04",
100105
PlacementGroupName: createPlacementGroupName("placement-group-1"),
101106
},
102107
newSpec: HCloudMachineSpec{
108+
ImageName: "ubuntu-24.04",
103109
PlacementGroupName: createPlacementGroupName("placement-group-2"),
104110
},
105111
},
@@ -124,24 +130,21 @@ func TestValidateHCloudMachineSpec(t *testing.T) {
124130
want: nil,
125131
},
126132
}
133+
127134
for _, tt := range tests {
128135
t.Run(tt.name, func(t *testing.T) {
129-
got := validateHCloudMachineSpec(tt.args.oldSpec, tt.args.newSpec)
136+
got := validateHCloudMachineSpecUpdate(tt.args.oldSpec, tt.args.newSpec)
130137

131-
if len(got) == 0 {
138+
if tt.want == nil {
132139
assert.Empty(t, got)
140+
return
133141
}
134142

135-
if len(got) > 1 {
136-
t.Errorf("got length: %d greater than 1", len(got))
137-
}
143+
assert.Equal(t, len(got), 1, "got length: %d greater than 1: %+v", len(got), got)
138144

139-
// assert if length of got is 1
140-
if len(got) == 1 {
141-
assert.Equal(t, tt.want.Type, got[0].Type)
142-
assert.Equal(t, tt.want.Field, got[0].Field)
143-
assert.Equal(t, tt.want.Detail, got[0].Detail)
144-
}
145+
assert.Equal(t, tt.want.Type, got[0].Type)
146+
assert.Equal(t, tt.want.Field, got[0].Field)
147+
assert.Equal(t, tt.want.Detail, got[0].Detail)
145148
})
146149
}
147150
}

api/v1beta1/hcloudmachine_webhook.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func (r *HCloudMachine) ValidateUpdate(old runtime.Object) (admission.Warnings,
8181
return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an HCloudMachine but got a %T", old))
8282
}
8383

84-
allErrs := validateHCloudMachineSpec(oldM.Spec, r.Spec)
84+
allErrs := validateHCloudMachineSpecUpdate(oldM.Spec, r.Spec)
8585

8686
return nil, aggregateObjErrors(r.GroupVersionKind().GroupKind(), r.Name, allErrs)
8787
}

api/v1beta1/types.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,3 +263,17 @@ func (s *HCloudNetworkSpec) IsZero() bool {
263263
}
264264
return true
265265
}
266+
267+
// HCloudBootState defines the boot state of an HCloud server.
268+
type HCloudBootState string
269+
270+
const (
271+
// HCloudBootStateUnset is the initial state when the boot state has not been set yet.
272+
HCloudBootStateUnset HCloudBootState = ""
273+
274+
// HCloudBootStateBootToRealOS indicates that the server is booting the operating system.
275+
HCloudBootStateBootToRealOS HCloudBootState = "BootToRealOS"
276+
277+
// HCloudBootStateOperatingSystemRunning indicates that the server is successfully running.
278+
HCloudBootStateOperatingSystemRunning HCloudBootState = "OperatingSystemRunning"
279+
)

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,20 @@ spec:
181181
- type
182182
type: object
183183
type: array
184+
bootState:
185+
description: |-
186+
BootState indicates the current state during provisioning.
187+
188+
189+
The states will be:
190+
"" -> BootToRealOS -> OperatingSystemRunning
191+
type: string
192+
bootStateSince:
193+
description: |-
194+
BootStateSince is the timestamp of the last change to BootState. It is used to timeout
195+
provisioning if a state takes too long.
196+
format: date-time
197+
type: string
184198
conditions:
185199
description: Conditions define the current service state of the HCloudMachine.
186200
items:

controllers/hcloudmachine_controller.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@ import (
2525
"time"
2626

2727
"github.com/go-logr/logr"
28+
"github.com/google/go-cmp/cmp"
2829
apierrors "k8s.io/apimachinery/pkg/api/errors"
30+
"k8s.io/apimachinery/pkg/util/wait"
2931
"k8s.io/klog/v2"
3032
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3133
"sigs.k8s.io/cluster-api/util"
@@ -145,6 +147,8 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R
145147
return reconcile.Result{}, fmt.Errorf("failed to create scope: %+v", err)
146148
}
147149

150+
initialHCloudMachine := hcloudMachine.DeepCopy()
151+
startReconcile := time.Now()
148152
// Always close the scope when exiting this function so we can persist any HCloudMachine changes.
149153
defer func() {
150154
if reterr != nil && errors.Is(reterr, hcloudclient.ErrUnauthorized) {
@@ -153,10 +157,66 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R
153157
conditions.MarkTrue(hcloudMachine, infrav1.HCloudTokenAvailableCondition)
154158
}
155159

160+
// the Close() will use PatchHelper to store the changes.
156161
if err := machineScope.Close(ctx); err != nil {
157162
res = reconcile.Result{}
158163
reterr = errors.Join(reterr, err)
159164
}
165+
166+
if !cmp.Equal(initialHCloudMachine, hcloudMachine) {
167+
// The hcloudMachine was changed. Wait until the local cache contains the revision
168+
// which was created by above machineScope.Close().
169+
// We want to read our own writes.
170+
err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (done bool, err error) {
171+
// new resource, read from local cache
172+
latest := &infrav1.HCloudMachine{}
173+
getErr := r.Get(ctx, client.ObjectKeyFromObject(machineScope.HCloudMachine), latest)
174+
if apierrors.IsNotFound(getErr) {
175+
// the object was deleted. All is fine.
176+
return true, nil
177+
}
178+
if getErr != nil {
179+
return false, getErr
180+
}
181+
// When the ResourceVersion has changed, then it is very likely that the local
182+
// cache has the new version.
183+
return latest.ResourceVersion != hcloudMachine.ResourceVersion, nil
184+
})
185+
if err != nil {
186+
log.Error(err, "cache sync failed after BootState change")
187+
}
188+
}
189+
190+
readyReason := conditions.GetReason(machineScope.HCloudMachine, clusterv1.ReadyCondition)
191+
readyMessage := conditions.GetMessage(machineScope.HCloudMachine, clusterv1.ReadyCondition)
192+
193+
duration := time.Since(startReconcile)
194+
195+
if duration > 5*time.Second {
196+
log.Info("Reconcile took too long",
197+
"reconcileDuration", duration,
198+
"res", res,
199+
"reterr", reterr,
200+
"oldState", initialHCloudMachine.Status.BootState,
201+
"newState", machineScope.HCloudMachine.Status.BootState,
202+
"readyReason", readyReason,
203+
"readyMessage", readyMessage,
204+
)
205+
}
206+
207+
if initialHCloudMachine.Status.BootState != machineScope.HCloudMachine.Status.BootState {
208+
startBootState := initialHCloudMachine.Status.BootStateSince
209+
if startBootState.IsZero() {
210+
startBootState = initialHCloudMachine.CreationTimestamp
211+
}
212+
log.Info("BootState changed",
213+
"oldState", initialHCloudMachine.Status.BootState,
214+
"newState", machineScope.HCloudMachine.Status.BootState,
215+
"durationInState", machineScope.HCloudMachine.Status.BootStateSince.Time.Sub(startBootState.Time),
216+
"readyReason", readyReason,
217+
"readyMessage", readyMessage,
218+
)
219+
}
160220
}()
161221

162222
// Check whether rate limit has been reached and if so, then wait.
@@ -168,6 +228,10 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R
168228
return r.reconcileDelete(ctx, machineScope)
169229
}
170230

231+
if hcloudMachine.Status.FailureReason != nil {
232+
// This machine will be removed.
233+
return reconcile.Result{}, nil
234+
}
171235
return r.reconcileNormal(ctx, machineScope)
172236
}
173237

@@ -460,6 +524,13 @@ func IgnoreInsignificantHCloudMachineStatusUpdates(logger logr.Logger) predicate
460524
oldHCloudMachine.ResourceVersion = ""
461525
newHCloudMachine.ResourceVersion = ""
462526

527+
// The ProviderID is set by the controller. Do not react if that changes.
528+
// Otherwise the next Reconcile is likely to read outdated data, because
529+
// the Status was not updated yet. PatchHelper updates three times in this order:
530+
// Status.Conditions, Resource, Status.
531+
oldHCloudMachine.Spec.ProviderID = nil
532+
newHCloudMachine.Spec.ProviderID = nil
533+
463534
oldHCloudMachine.Status = infrav1.HCloudMachineStatus{}
464535
newHCloudMachine.Status = infrav1.HCloudMachineStatus{}
465536

controllers/hcloudmachine_controller_test.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ var _ = Describe("HCloudMachineReconciler", func() {
365365
}, timeout).Should(BeTrue())
366366
})
367367

368-
It("creates the HCloud machine in Hetzner 1", func() {
368+
It("creates the HCloud machine in Hetzner 1 (flaky)", func() {
369369
By("checking that no servers exist")
370370

371371
Eventually(func() bool {
@@ -433,6 +433,24 @@ var _ = Describe("HCloudMachineReconciler", func() {
433433
Eventually(func() bool {
434434
return isPresentAndTrue(key, hcloudMachine, infrav1.ServerAvailableCondition)
435435
}, timeout, interval).Should(BeTrue())
436+
437+
By("checking if the BootState is now OperatingSystemRunning")
438+
Eventually(func() bool {
439+
if err = testEnv.Get(ctx, key, hcloudMachine); err != nil {
440+
return false
441+
}
442+
443+
return hcloudMachine.Status.BootState == infrav1.HCloudBootStateOperatingSystemRunning && !hcloudMachine.Status.BootStateSince.IsZero()
444+
}, timeout, interval).Should(BeTrue())
445+
446+
By("checking if the ssh keys are set in the status")
447+
Eventually(func() bool {
448+
if err = testEnv.Get(ctx, key, hcloudMachine); err != nil {
449+
return false
450+
}
451+
452+
return len(hcloudMachine.Status.SSHKeys) == 1 && hcloudMachine.Status.SSHKeys[0].Name == "testsshkey"
453+
}, timeout, interval).Should(BeTrue())
436454
})
437455
})
438456

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ require (
77
github.com/bramvdbogaerde/go-scp v1.5.0
88
github.com/go-logr/logr v1.4.2
99
github.com/go-logr/zapr v1.3.0
10+
github.com/google/go-cmp v0.7.0
1011
github.com/guettli/check-conditions v0.0.20
1112
github.com/hetznercloud/hcloud-go/v2 v2.19.1
1213
github.com/mitchellh/copystructure v1.2.0
@@ -72,7 +73,6 @@ require (
7273
github.com/golang/protobuf v1.5.4 // indirect
7374
github.com/google/cel-go v0.17.8 // indirect
7475
github.com/google/gnostic-models v0.6.8 // indirect
75-
github.com/google/go-cmp v0.7.0 // indirect
7676
github.com/google/go-github/v53 v53.2.0 // indirect
7777
github.com/google/go-querystring v1.1.0 // indirect
7878
github.com/google/gofuzz v1.2.0 // indirect

0 commit comments

Comments
 (0)