
Commit fc4946c

Merge pull request #2685 from okozachenko1203/release-0.11-fix-panic-on-server-deletion
🐛 [release-0.11] fix: fix panic on server deletion
2 parents 20bcb0f + 46194f2
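Background: before this fix, the machine controller's reconcileNormal assumed the instance lookup always returned a non-nil status and called instanceStatus.NetworkStatus() unconditionally, so a server deleted out of band (reported by the lookup as a nil status with a nil error) crashed the controller with a nil pointer dereference. The following minimal, self-contained Go sketch reproduces that failure mode; the type and function names are illustrative stand-ins, not CAPO's actual ones.

    package main

    import "fmt"

    // instanceStatus is an illustrative stand-in for the status object the
    // compute service returns; its methods dereference the pointer receiver.
    type instanceStatus struct {
        addresses []string
    }

    // networkStatus dereferences the receiver, so calling it on a nil
    // *instanceStatus panics: this is the crash the commit guards against.
    func (s *instanceStatus) networkStatus() []string {
        return s.addresses
    }

    // getInstanceStatus models the lookup: a server deleted by an external
    // agent is reported as (nil, nil), meaning no status but also no error.
    func getInstanceStatus(serverDeleted bool) (*instanceStatus, error) {
        if serverDeleted {
            return nil, nil
        }
        return &instanceStatus{addresses: []string{"10.0.0.5"}}, nil
    }

    func main() {
        status, err := getInstanceStatus(true)
        if err != nil {
            fmt.Println("lookup failed:", err)
            return
        }
        // Pre-fix: status.networkStatus() would be called here and panic.
        // Post-fix: the nil status is caught and the machine is marked failed.
        if status == nil {
            fmt.Println("The server was unexpectedly deleted")
            return
        }
        fmt.Println(status.networkStatus())
    }

The diff below closes the gap in two places: reconcileNormal now fails the machine explicitly when the status is nil, and getOrCreateServer distinguishes a lookup error from a server that has simply vanished.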

File tree

7 files changed: +285, -19 lines


api/v1beta1/conditions_consts.go

Lines changed: 3 additions & 0 deletions

@@ -44,6 +44,9 @@ const (
 	OpenStackErrorReason = "OpenStackError"
 	// DependencyFailedReason indicates that a dependent object failed.
 	DependencyFailedReason = "DependencyFailed"
+
+	// ServerUnexpectedDeletedMessage is the message used when the server is unexpectedly deleted via an external agent.
+	ServerUnexpectedDeletedMessage = "The server was unexpectedly deleted"
 )
 
 const (

controllers/openstackmachine_controller.go

Lines changed: 15 additions & 11 deletions

@@ -377,6 +377,12 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
 		return ctrl.Result{}, err
 	}
 
+	if instanceStatus == nil {
+		conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeletedReason, clusterv1.ConditionSeverityError, infrav1.ServerUnexpectedDeletedMessage)
+		openStackMachine.SetFailure(capierrors.UpdateMachineError, errors.New(infrav1.ServerUnexpectedDeletedMessage)) //nolint:stylecheck // This error is not used as an error
+		return ctrl.Result{}, nil
+	}
+
 	instanceNS, err := instanceStatus.NetworkStatus()
 	if err != nil {
 		return ctrl.Result{}, fmt.Errorf("get network status: %w", err)

@@ -393,22 +399,20 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
 	})
 	openStackMachine.Status.Addresses = addresses
 
-	result := r.reconcileMachineState(scope, openStackMachine, machine, machineServer)
-	if result != nil {
-		return *result, nil
-	}
+	if util.IsControlPlaneMachine(machine) {
+		err = r.reconcileAPIServerLoadBalancer(scope, openStackCluster, openStackMachine, instanceStatus, instanceNS, clusterResourceName)
+		if err != nil {
+			return ctrl.Result{}, err
+		}
 
-	if !util.IsControlPlaneMachine(machine) {
-		scope.Logger().Info("Not a Control plane machine, no floating ip reconcile needed, Reconciled Machine create successfully")
-		return ctrl.Result{}, nil
+		conditions.MarkTrue(openStackMachine, infrav1.APIServerIngressReadyCondition)
 	}
 
-	err = r.reconcileAPIServerLoadBalancer(scope, openStackCluster, openStackMachine, instanceStatus, instanceNS, clusterResourceName)
-	if err != nil {
-		return ctrl.Result{}, err
+	result := r.reconcileMachineState(scope, openStackMachine, machine, machineServer)
+	if result != nil {
+		return *result, nil
 	}
 
-	conditions.MarkTrue(openStackMachine, infrav1.APIServerIngressReadyCondition)
 	scope.Logger().Info("Reconciled Machine create successfully")
 	return ctrl.Result{}, nil
 }
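The new nil-status branch surfaces the deletion through two channels: a false InstanceReadyCondition for visibility, and a terminal failure recorded via SetFailure. For context, the conventional Cluster API provider pattern behind such a setter looks roughly like the trimmed sketch below; the status type is a stand-in and CAPO's actual implementation may differ in detail.

    package sketch

    import (
        "k8s.io/utils/ptr"
        capierrors "sigs.k8s.io/cluster-api/errors"
    )

    // OpenStackMachineStatus is a trimmed stand-in for the real status type.
    type OpenStackMachineStatus struct {
        FailureReason  *capierrors.MachineStatusError
        FailureMessage *string
    }

    type OpenStackMachine struct {
        Status OpenStackMachineStatus
    }

    // SetFailure records a terminal failure that Cluster API surfaces on the
    // owning Machine; once set, the provider does not reconcile the machine
    // back to a healthy state.
    func (m *OpenStackMachine) SetFailure(reason capierrors.MachineStatusError, message error) {
        m.Status.FailureReason = &reason
        m.Status.FailureMessage = ptr.To(message.Error())
    }

Because FailureReason and FailureMessage are terminal in Cluster API, the owning Machine is typically left for remediation (for example by a MachineHealthCheck) rather than retried, which is why this branch returns ctrl.Result{}, nil instead of an error.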

controllers/openstackserver_controller.go

Lines changed: 14 additions & 8 deletions

@@ -433,9 +433,18 @@ func (r *OpenStackServerReconciler) getOrCreateServer(ctx context.Context, logge
 
 	if openStackServer.Status.InstanceID != nil {
 		instanceStatus, err = computeService.GetInstanceStatus(*openStackServer.Status.InstanceID)
-		if err != nil {
-			logger.Info("Unable to get OpenStack instance", "name", openStackServer.Name)
-			conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, "%s", err.Error())
+		if err != nil || instanceStatus == nil {
+			logger.Info("Unable to get OpenStack instance", "name", openStackServer.Name, "id", *openStackServer.Status.InstanceID)
+			var msg string
+			var reason string
+			if err != nil {
+				msg = err.Error()
+				reason = infrav1.OpenStackErrorReason
+			} else {
+				msg = infrav1.ServerUnexpectedDeletedMessage
+				reason = infrav1.InstanceNotFoundReason
+			}
+			conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, reason, clusterv1.ConditionSeverityError, msg)
 			return nil, err
 		}
 	}

@@ -450,11 +459,6 @@ func (r *OpenStackServerReconciler) getOrCreateServer(ctx context.Context, logge
 		logger.Info("Server already exists", "name", openStackServer.Name, "id", instanceStatus.ID())
 		return instanceStatus, nil
 	}
-	if openStackServer.Status.InstanceID != nil {
-		logger.Info("Not reconciling server in failed state. The previously existing OpenStack instance is no longer available")
-		conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, infrav1.InstanceNotFoundReason, clusterv1.ConditionSeverityError, "virtual machine no longer exists")
-		return nil, nil
-	}
 
 	logger.Info("Server does not exist, creating Server", "name", openStackServer.Name)
 	instanceSpec, err := r.serverToInstanceSpec(ctx, openStackServer)

@@ -468,6 +472,8 @@ func (r *OpenStackServerReconciler) getOrCreateServer(ctx context.Context, logge
 			openStackServer.Status.InstanceState = &infrav1.InstanceStateError
 			return nil, fmt.Errorf("create OpenStack instance: %w", err)
 		}
+		// We reached a point where a server was created with no error, but we can't predict its state yet, which is why we don't update conditions yet.
+		// The actual state of the server is checked in the next reconcile loops.
 		return instanceStatus, nil
 	}
 	return instanceStatus, nil
controllers/openstackserver_controller_test.go

Lines changed: 184 additions & 0 deletions

@@ -31,8 +31,11 @@ import (
 	"github.com/gophercloud/gophercloud/v2/openstack/networking/v2/ports"
 	. "github.com/onsi/gomega" //nolint:revive
 	"go.uber.org/mock/gomock"
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/ptr"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/util/conditions"
 
 	infrav1alpha1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1alpha1"
 	infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1beta1"

@@ -548,3 +551,184 @@ func Test_OpenStackServerReconcileCreate(t *testing.T) {
 		})
 	}
 }
+
+func TestOpenStackServerReconciler_getOrCreateServer(t *testing.T) {
+	tests := []struct {
+		name            string
+		openStackServer *infrav1alpha1.OpenStackServer
+		setupMocks      func(r *recorders)
+		wantServer      *servers.Server
+		wantErr         bool
+		wantCondition   *clusterv1.Condition
+	}{
+		{
+			name: "instanceID set in status but server not found",
+			openStackServer: &infrav1alpha1.OpenStackServer{
+				Status: infrav1alpha1.OpenStackServerStatus{
+					InstanceID: ptr.To(instanceUUID),
+				},
+			},
+			setupMocks: func(r *recorders) {
+				r.compute.GetServer(instanceUUID).Return(nil, gophercloud.ErrUnexpectedResponseCode{Actual: 404})
+			},
+			wantErr: false,
+			wantCondition: &clusterv1.Condition{
+				Type:    infrav1.InstanceReadyCondition,
+				Status:  corev1.ConditionFalse,
+				Reason:  infrav1.InstanceNotFoundReason,
+				Message: infrav1.ServerUnexpectedDeletedMessage,
+			},
+		},
+		{
+			name: "instanceID set in status but server not found with error",
+			openStackServer: &infrav1alpha1.OpenStackServer{
+				Status: infrav1alpha1.OpenStackServerStatus{
+					InstanceID: ptr.To(instanceUUID),
+				},
+			},
+			setupMocks: func(r *recorders) {
+				r.compute.GetServer(instanceUUID).Return(nil, fmt.Errorf("error"))
+			},
+			wantErr: true,
+			wantCondition: &clusterv1.Condition{
+				Type:    infrav1.InstanceReadyCondition,
+				Status:  corev1.ConditionFalse,
+				Reason:  infrav1.OpenStackErrorReason,
+				Message: "get server \"" + instanceUUID + "\" detail failed: error",
+			},
+		},
+		{
+			name: "instanceStatus is nil but server found with machine name",
+			openStackServer: &infrav1alpha1.OpenStackServer{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: openStackServerName,
+				},
+				Status: infrav1alpha1.OpenStackServerStatus{},
+			},
+			setupMocks: func(r *recorders) {
+				r.compute.ListServers(servers.ListOpts{
+					Name: "^" + openStackServerName + "$",
+				}).Return([]servers.Server{{ID: instanceUUID}}, nil)
+			},
+			wantErr: false,
+			wantServer: &servers.Server{
+				ID: instanceUUID,
+			},
+		},
+		{
+			name: "instanceStatus is nil and server not found and then created",
+			openStackServer: &infrav1alpha1.OpenStackServer{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: openStackServerName,
+				},
+				Status: infrav1alpha1.OpenStackServerStatus{
+					Resolved: &infrav1alpha1.ResolvedServerSpec{
+						ImageID:  imageUUID,
+						FlavorID: flavorUUID,
+						Ports:    defaultResolvedPorts,
+					},
+				},
+			},
+			setupMocks: func(r *recorders) {
+				r.compute.ListServers(servers.ListOpts{
+					Name: "^" + openStackServerName + "$",
+				}).Return([]servers.Server{}, nil)
+				r.compute.CreateServer(gomock.Any(), gomock.Any()).Return(&servers.Server{ID: instanceUUID}, nil)
+			},
+			wantErr: false,
+			wantServer: &servers.Server{
+				ID: instanceUUID,
+			},
+			// It's odd but no condition is set, because the server creation was kicked off and we
+			// don't know the result yet in this function.
+		},
+		{
+			name: "instanceStatus is nil and server not found and then created with error",
+			openStackServer: &infrav1alpha1.OpenStackServer{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: openStackServerName,
+				},
+				Status: infrav1alpha1.OpenStackServerStatus{
+					Resolved: &infrav1alpha1.ResolvedServerSpec{
+						ImageID:  imageUUID,
+						FlavorID: flavorUUID,
+						Ports:    defaultResolvedPorts,
+					},
+				},
+			},
+			setupMocks: func(r *recorders) {
+				r.compute.ListServers(servers.ListOpts{
+					Name: "^" + openStackServerName + "$",
+				}).Return([]servers.Server{}, nil)
+				r.compute.CreateServer(gomock.Any(), gomock.Any()).Return(nil, fmt.Errorf("error"))
+			},
+			wantErr: true,
+			wantCondition: &clusterv1.Condition{
+				Type:    infrav1.InstanceReadyCondition,
+				Status:  corev1.ConditionFalse,
+				Reason:  infrav1.InstanceCreateFailedReason,
+				Message: "error creating Openstack instance: " + "error",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewGomegaWithT(t)
+			log := testr.New(t)
+
+			mockCtrl := gomock.NewController(t)
+			defer mockCtrl.Finish()
+
+			mockScopeFactory := scope.NewMockScopeFactory(mockCtrl, "")
+			scopeWithLogger := scope.NewWithLogger(mockScopeFactory, log)
+
+			computeRecorder := mockScopeFactory.ComputeClient.EXPECT()
+			imageRecorder := mockScopeFactory.ImageClient.EXPECT()
+			networkRecorder := mockScopeFactory.NetworkClient.EXPECT()
+			volumeRecorder := mockScopeFactory.VolumeClient.EXPECT()
+
+			recorders := &recorders{
+				compute: computeRecorder,
+				image:   imageRecorder,
+				network: networkRecorder,
+				volume:  volumeRecorder,
+			}
+
+			if tt.setupMocks != nil {
+				tt.setupMocks(recorders)
+			}
+
+			computeService, err := compute.NewService(scopeWithLogger)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			reconciler := OpenStackServerReconciler{}
+			status, err := reconciler.getOrCreateServer(ctx, log, tt.openStackServer, computeService, []string{portUUID})
+
+			// Check error result
+			if tt.wantErr {
+				g.Expect(err).To(HaveOccurred())
+			} else {
+				g.Expect(err).ToNot(HaveOccurred())
+			}
+
+			// Check instance status
+			if tt.wantServer != nil {
+				g.Expect(status.ID()).To(Equal(tt.wantServer.ID))
+			}
+
+			// Check the condition is set correctly
+			if tt.wantCondition != nil {
+				// print openStackServer conditions
+				for _, condition := range tt.openStackServer.Status.Conditions {
+					t.Logf("Condition: %s, Status: %s, Reason: %s", condition.Type, condition.Status, condition.Reason)
+				}
+				conditionType := conditions.Get(tt.openStackServer, tt.wantCondition.Type)
+				g.Expect(conditionType).ToNot(BeNil())
+				g.Expect(conditionType.Status).To(Equal(tt.wantCondition.Status))
+				g.Expect(conditionType.Reason).To(Equal(tt.wantCondition.Reason))
+				g.Expect(conditionType.Message).To(Equal(tt.wantCondition.Message))
+			}
+		})
+	}
+}

test/e2e/data/e2e_conf.yaml

Lines changed: 1 addition & 0 deletions

@@ -188,6 +188,7 @@ intervals:
   default/wait-control-plane: ["30m", "10s"]
   default/wait-worker-nodes: ["30m", "10s"]
   default/wait-delete-cluster: ["5m", "10s"]
+  default/wait-delete-machine: ["30m", "10s"]
   default/wait-alt-az: ["20m", "30s"]
   default/wait-machine-upgrade: ["30m", "10s"]
   default/wait-nodes-ready: ["15m", "10s"]

test/e2e/shared/openstack.go

Lines changed: 15 additions & 0 deletions

@@ -841,6 +841,21 @@ func CreateOpenStackNetwork(e2eCtx *E2EContext, name, cidr string) (*networks.Ne
 	return net, nil
 }
 
+func DeleteOpenStackServer(ctx context.Context, e2eCtx *E2EContext, id string) error {
+	providerClient, clientOpts, _, err := GetTenantProviderClient(e2eCtx)
+	if err != nil {
+		_, _ = fmt.Fprintf(GinkgoWriter, "error creating provider client: %s\n", err)
+		return err
+	}
+
+	computeClient, err := openstack.NewComputeV2(providerClient, gophercloud.EndpointOpts{Region: clientOpts.RegionName})
+	if err != nil {
+		return fmt.Errorf("error creating compute client: %s", err)
+	}
+
+	return servers.Delete(ctx, computeClient, id).ExtractErr()
+}
+
 func DeleteOpenStackNetwork(ctx context.Context, e2eCtx *E2EContext, id string) error {
 	providerClient, clientOpts, _, err := GetTenantProviderClient(e2eCtx)
 	if err != nil {

test/e2e/suites/e2e/e2e_test.go

Lines changed: 53 additions & 0 deletions

@@ -55,7 +55,10 @@ import (
 	"sigs.k8s.io/cluster-api/test/framework/clusterctl"
 	crclient "sigs.k8s.io/controller-runtime/pkg/client"
 
+	"github.com/k-orc/openstack-resource-controller/pkg/utils/ssa"
+
 	infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1beta1"
+	"sigs.k8s.io/cluster-api-provider-openstack/pkg/generated/applyconfiguration/api/v1beta1"
 	"sigs.k8s.io/cluster-api-provider-openstack/test/e2e/shared"
 )
 

@@ -1006,6 +1009,56 @@ var _ = Describe("e2e tests [PR-Blocking]", func() {
 				Expect(additionalVolume.AvailabilityZone).To(Equal(failureDomain))
 				Expect(additionalVolume.VolumeType).To(Equal(volumeTypeAlt))
 			}
+
+			// This last block tests a scenario where an external agent deletes a server (e.g. a user via Horizon).
+			// We want to ensure that the OpenStackMachine conditions are updated to reflect the server deletion.
+			// Context: https://github.com/kubernetes-sigs/cluster-api-provider-openstack/issues/2474
+			shared.Logf("Deleting a server")
+			serverToDelete := allServers[controlPlaneMachines[0].Spec.InfrastructureRef.Name]
+			err = shared.DeleteOpenStackServer(ctx, e2eCtx, serverToDelete.ID)
+			Expect(err).NotTo(HaveOccurred())
+
+			shared.Logf("Waiting for the OpenStackMachine to have a condition that the server has been unexpectedly deleted")
+			retries := 0
+			Eventually(func() (clusterv1.Condition, error) {
+				k8sClient := e2eCtx.Environment.BootstrapClusterProxy.GetClient()
+
+				openStackMachine := &infrav1.OpenStackMachine{}
+				err := k8sClient.Get(ctx, crclient.ObjectKey{Name: controlPlaneMachines[0].Name, Namespace: controlPlaneMachines[0].Namespace}, openStackMachine)
+				if err != nil {
+					return clusterv1.Condition{}, err
+				}
+				for _, condition := range openStackMachine.Status.Conditions {
+					if condition.Type == infrav1.InstanceReadyCondition {
+						return condition, nil
+					}
+				}
+
+				// Make some non-functional change to the object which will
+				// cause CAPO to reconcile it, otherwise we won't notice the
+				// server is gone until the configured controller-runtime
+				// resync.
+				retries++
+				applyConfig := v1beta1.OpenStackMachine(openStackMachine.Name, openStackMachine.Namespace).
+					WithAnnotations(map[string]string{
+						"e2e-test-retries": fmt.Sprintf("%d", retries),
+					})
+				err = k8sClient.Patch(ctx, openStackMachine, ssa.ApplyConfigPatch(applyConfig), crclient.ForceOwnership, crclient.FieldOwner("capo-e2e"))
+				if err != nil {
+					return clusterv1.Condition{}, err
+				}
+
+				return clusterv1.Condition{}, errors.New("condition InstanceReadyCondition not found")
+			}, time.Minute*3, time.Second*10).Should(MatchFields(
+				IgnoreExtras,
+				Fields{
+					"Type":     Equal(infrav1.InstanceReadyCondition),
+					"Status":   Equal(corev1.ConditionFalse),
+					"Reason":   Equal(infrav1.InstanceDeletedReason),
+					"Message":  Equal(infrav1.ServerUnexpectedDeletedMessage),
+					"Severity": Equal(clusterv1.ConditionSeverityError),
+				},
+			), "OpenStackMachine should be marked not ready with InstanceDeletedReason")
 		})
 	})
 })
