From 47154eaa67acd0d99630f91dcdc57c6c3f6b6b5b Mon Sep 17 00:00:00 2001 From: Sven Haardiek Date: Fri, 14 Nov 2025 14:27:36 +0100 Subject: [PATCH] Shutdown VMs before Deletion Tries to shutdown the OpenStack VM before deleting it. This way even Pods form Daemonsets are shut down more gracefully and services like license daemons on the VMs can be properly shutdown. Related to kubernetes-sigs#1973 --- api/v1beta1/conditions_consts.go | 2 ++ controllers/openstackserver_controller.go | 8 ++++++ pkg/clients/compute.go | 11 ++++++++ pkg/clients/mock/compute.go | 14 ++++++++++ pkg/cloud/services/compute/instance.go | 32 +++++++++++++++++++++++ 5 files changed, 67 insertions(+) diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go index d84500772f..dfa9190029 100644 --- a/api/v1beta1/conditions_consts.go +++ b/api/v1beta1/conditions_consts.go @@ -38,6 +38,8 @@ const ( InstanceDeletedReason = "InstanceDeleted" // InstanceNotReadyReason used when the instance is in a pending state. InstanceNotReadyReason = "InstanceNotReady" + // InstanceStopFailedReason used when stopping the instance failed. + InstanceStopFailedReason = "InstanceStopFailed" // InstanceDeleteFailedReason used when deleting the instance failed. InstanceDeleteFailedReason = "InstanceDeleteFailed" // OpenstackErrorReason used when there is an error communicating with OpenStack. diff --git a/controllers/openstackserver_controller.go b/controllers/openstackserver_controller.go index 26494bfa49..6afd74158b 100644 --- a/controllers/openstackserver_controller.go +++ b/controllers/openstackserver_controller.go @@ -257,6 +257,14 @@ func (r *OpenStackServerReconciler) reconcileDelete(scope *scope.WithLogger, ope return fmt.Errorf("delete volumes: %w", err) } } else { + // Try to stop the VM, if active before deletion + if instanceStatus.State() == infrav1.InstanceStateActive { + if err := computeService.StopInstance(openStackServer, instanceStatus); err != nil { + v1beta1conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, infrav1.InstanceStopFailedReason, clusterv1beta1.ConditionSeverityError, "Stopping instance failed: %v", err) + return fmt.Errorf("stop instance: %w", err) + } + } + if err := computeService.DeleteInstance(openStackServer, instanceStatus); err != nil { v1beta1conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1beta1.ConditionSeverityError, "Deleting instance failed: %v", err) return fmt.Errorf("delete instance: %w", err) diff --git a/pkg/clients/compute.go b/pkg/clients/compute.go index 0338417b93..007c10f8aa 100644 --- a/pkg/clients/compute.go +++ b/pkg/clients/compute.go @@ -61,6 +61,7 @@ type ComputeClient interface { ListFlavors() ([]flavors.Flavor, error) CreateServer(createOpts servers.CreateOptsBuilder, schedulerHints servers.SchedulerHintOptsBuilder) (*servers.Server, error) DeleteServer(serverID string) error + StopServer(serverID string) error GetServer(serverID string) (*servers.Server, error) ListServers(listOpts servers.ListOptsBuilder) ([]servers.Server, error) @@ -141,6 +142,12 @@ func (c computeClient) DeleteServer(serverID string) error { return mc.ObserveRequestIgnoreNotFound(err) } +func (c computeClient) StopServer(serverID string) error { + mc := metrics.NewMetricPrometheusContext("server", "stop") + err := servers.Stop(context.TODO(), c.client, serverID).ExtractErr() + return mc.ObserveRequestIgnoreNotFound(err) +} + func (c computeClient) GetServer(serverID string) (*servers.Server, error) { var server servers.Server mc := metrics.NewMetricPrometheusContext("server", "get") @@ -230,6 +237,10 @@ func (e computeErrorClient) DeleteServer(_ string) error { return e.error } +func (e computeErrorClient) StopServer(_ string) error { + return e.error +} + func (e computeErrorClient) GetServer(_ string) (*servers.Server, error) { return nil, e.error } diff --git a/pkg/clients/mock/compute.go b/pkg/clients/mock/compute.go index c83d7a701e..cb67894790 100644 --- a/pkg/clients/mock/compute.go +++ b/pkg/clients/mock/compute.go @@ -208,6 +208,20 @@ func (mr *MockComputeClientMockRecorder) ListServers(listOpts any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListServers", reflect.TypeOf((*MockComputeClient)(nil).ListServers), listOpts) } +// StopServer mocks base method. +func (m *MockComputeClient) StopServer(serverID string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "StopServer", serverID) + ret0, _ := ret[0].(error) + return ret0 +} + +// StopServer indicates an expected call of StopServer. +func (mr *MockComputeClientMockRecorder) StopServer(serverID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StopServer", reflect.TypeOf((*MockComputeClient)(nil).StopServer), serverID) +} + // WithMicroversion mocks base method. func (m *MockComputeClient) WithMicroversion(required string) (clients.ComputeClient, error) { m.ctrl.T.Helper() diff --git a/pkg/cloud/services/compute/instance.go b/pkg/cloud/services/compute/instance.go index 255faf6d5c..189a3a631e 100644 --- a/pkg/cloud/services/compute/instance.go +++ b/pkg/cloud/services/compute/instance.go @@ -45,6 +45,7 @@ import ( const ( retryIntervalInstanceStatus = 10 * time.Second timeoutInstanceCreate = 5 + timeoutInstanceStop = 5 * time.Minute timeoutInstanceDelete = 5 * time.Minute ) @@ -560,6 +561,37 @@ func (s *Service) deleteVolume(instanceName string, nameSuffix string) error { return s.getVolumeClient().DeleteVolume(volume.ID, volumes.DeleteOpts{}) } +func (s *Service) StopInstance(eventObject runtime.Object, instanceStatus *InstanceStatus) error { + instance := instanceStatus.InstanceIdentifier() + err := s.getComputeClient().StopServer(instance.ID) + if err != nil { + if capoerrors.IsNotFound(err) { + record.Eventf(eventObject, "SuccessfulStopServer", "Server %s with id %s did not exist", instance.Name, instance.ID) + return nil + } + record.Warnf(eventObject, "FailedStopServer", "Failed to stop server %s with id %s: %v", instance.Name, instance.ID, err) + return err + } + err = wait.PollUntilContextTimeout(context.TODO(), retryIntervalInstanceStatus, timeoutInstanceStop, true, func(_ context.Context) (bool, error) { + i, err := s.GetInstanceStatus(instance.ID) + if err != nil { + return false, err + } + // a little bit noisy, but since the code is new, I guess this is a little bit easier for debugging purposes + record.Eventf(eventObject, "Server State", "server %s with id %s has state %v", instance.Name, instance.ID, i.State()) + if i.State() == infrav1.InstanceStateShutoff { + return true, nil + } + return false, nil + }) + if err != nil { + record.Warnf(eventObject, "FailedStopServer", "Failed to stop server %s with id %s: %v", instance.Name, instance.ID, err) + return err + } + record.Eventf(eventObject, "SuccessfulStopServer", "Stopped server %s with id %s", instance.Name, instance.ID) + return nil +} + func (s *Service) GetInstanceStatus(resourceID string) (instance *InstanceStatus, err error) { if resourceID == "" { return nil, fmt.Errorf("resourceId should be specified to get detail")