@@ -26,25 +26,24 @@ import (
 
 	"github.com/gophercloud/gophercloud/v2"
 	"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates"
+	"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/hypervisors"
 	"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/services"
 	"github.com/gophercloud/gophercloud/v2/openstack/placement/v1/resourceproviders"
-	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/util/retry"
 	ctrl "sigs.k8s.io/controller-runtime"
 	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	logger "sigs.k8s.io/controller-runtime/pkg/log"
 
 	kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
 	"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/openstack"
+	"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/utils"
 )
 
 const (
-	decommissionFinalizerName  = "cobaltcore.cloud.sap/decommission-hypervisor"
-	DecommissionControllerName = "nodeDecommission"
+	DecommissionControllerName = "decommission"
 )
 
 type NodeDecommissionReconciler struct {
@@ -57,145 +56,108 @@ type NodeDecommissionReconciler struct {
5756// The counter-side in gardener is here:
5857// https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L646
5958
60- // +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;patch;update
61- // +kubebuilder:rbac:groups="",resources=nodes/finalizers,verbs=update
6259// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch
6360// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;update;patch
61+
6462func (r * NodeDecommissionReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
65- hostname := req .Name
66- log := logger .FromContext (ctx ).WithName (req .Name ).WithValues ("hostname" , hostname )
67- ctx = logger .IntoContext (ctx , log )
63+ log := logger .FromContext (ctx ).WithName (req .Name )
64+ hv := & kvmv1.Hypervisor {}
65+ if err := retry .RetryOnConflict (retry .DefaultRetry , func () error {
66+ if err := r .Get (ctx , req .NamespacedName , hv ); err != nil {
67+ // ignore not found errors, could be deleted
68+ return k8sclient .IgnoreNotFound (err )
69+ }
6870
69- node := & corev1.Node {}
70- if err := r .Get (ctx , req .NamespacedName , node ); err != nil {
71- return ctrl.Result {}, k8sclient .IgnoreNotFound (err )
72- }
71+ setDecommissioningCondition := func (msg string ) {
72+ meta .SetStatusCondition (& hv .Status .Conditions , metav1.Condition {
73+ Type : kvmv1 .ConditionTypeReady ,
74+ Status : metav1 .ConditionFalse ,
75+ Reason : kvmv1 .ConditionReasonDecommissioning ,
76+ Message : msg ,
77+ })
78+ }
7379
74- // Fetch HV to check if lifecycle management is enabled
75- hv := & kvmv1.Hypervisor {}
76- if err := r .Get (ctx , k8sclient.ObjectKey {Name : hostname }, hv ); err != nil {
77- // ignore not found errors, could be deleted
78- return ctrl.Result {}, k8sclient .IgnoreNotFound (err )
79- }
80- if ! hv .Spec .LifecycleEnabled {
81- // Get out of the way
82- return r .removeFinalizer (ctx , node )
83- }
80+ if meta .IsStatusConditionTrue (hv .Status .Conditions , kvmv1 .ConditionTypeReady ) {
81+ setDecommissioningCondition ("Node is being decommissioned, removing host from nova" )
82+ return r .Status ().Update (ctx , hv )
83+ }
8484
85- if ! controllerutil .ContainsFinalizer (node , decommissionFinalizerName ) {
86- return ctrl.Result {}, retry .RetryOnConflict (retry .DefaultRetry , func () error {
87- patch := k8sclient .MergeFrom (node .DeepCopy ())
88- controllerutil .AddFinalizer (node , decommissionFinalizerName )
89- if err := r .Patch (ctx , node , patch ); err != nil {
90- return fmt .Errorf ("failed to add finalizer due to %w" , err )
85+ hypervisor , err := openstack .GetHypervisorByName (ctx , r .computeClient , hv .Name , true )
86+ if err != nil {
87+ if errors .Is (err , openstack .ErrNoHypervisor ) {
88+ // We are (hopefully) done
89+ setDecommissioningCondition ("Node not registered in nova anymore, proceeding with deletion" )
90+ hv .Status .Evicted = true
91+ return r .Status ().Update (ctx , hv )
9192 }
92- log .Info ("Added finalizer" )
93- return nil
94- })
95- }
9693
97- // Not yet deleting hv, nothing more to do
98- if node .DeletionTimestamp .IsZero () {
99- return ctrl.Result {}, nil
100- }
94+ setDecommissioningCondition (fmt .Sprintf ("Failed to get %q from openstack: %v" , hv .Name , err ))
95+ return r .Status ().Update (ctx , hv )
96+ }
10197
102- // Someone is just deleting the hv, without going through termination
103- // See: https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L658-L659
104- if ! IsNodeConditionTrue (node .Status .Conditions , "Terminating" ) {
105- log .Info ("removing finalizer since not terminating" )
106- // So we just get out of the way for now
107- return r .removeFinalizer (ctx , node )
108- }
+		if err = r.doDecommission(ctx, hv, hypervisor); err != nil {
+			log.Error(err, "Failed to decommission node", "node", hv.Name)
+			setDecommissioningCondition(err.Error())
+			return r.Status().Update(ctx, hv)
+		}
 
-	if meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeReady) {
-		return r.setDecommissioningCondition(ctx, hv, "Node is being decommissioned, removing host from nova")
+		// Decommissioning succeeded, proceed with deletion
+		hv.Status.Evicted = true
+		return r.Status().Update(ctx, hv)
+	}); err != nil {
+		return ctrl.Result{}, err
 	}
 
-	log.Info("removing host from nova")
-
-	hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hostname, true)
-	if errors.Is(err, openstack.ErrNoHypervisor) {
-		// We are (hopefully) done
-		return r.removeFinalizer(ctx, node)
-	}
+	return ctrl.Result{RequeueAfter: utils.ShortRetryTime}, nil
+}
 
+func (r *NodeDecommissionReconciler) doDecommission(ctx context.Context, hv *kvmv1.Hypervisor, hypervisor *hypervisors.Hypervisor) error {
 	// TODO: remove since RunningVMs is only available until micro-version 2.87, and also is updated asynchronously
 	// so it might be not accurate
 	if hypervisor.RunningVMs > 0 {
 		// Still running VMs, cannot delete the service
-		msg := fmt.Sprintf("Node is being decommissioned, but still has %d running VMs", hypervisor.RunningVMs)
-		return r.setDecommissioningCondition(ctx, hv, msg)
+		return fmt.Errorf("node is being decommissioned, but still has %d running VMs", hypervisor.RunningVMs)
 	}
 
 	if hypervisor.Servers != nil && len(*hypervisor.Servers) > 0 {
 		// Still VMs assigned to the host, cannot delete the service
-		msg := fmt.Sprintf("Node is being decommissioned, but still has %d assigned VMs, "+
-			"check with `openstack server list --all-projects --host %s`", len(*hypervisor.Servers), hostname)
-		return r.setDecommissioningCondition(ctx, hv, msg)
+		return fmt.Errorf("node is being decommissioned, but still has %d assigned VMs, "+
+			"check with `openstack server list --all-projects --host %s`", len(*hypervisor.Servers), hv.Name)
 	}
 
-	// Before removing the service, first take the node out of the aggregates,
-	// so when the node comes back, it doesn't up with the old associations
-	aggs, err := aggregatesByName(ctx, r.computeClient)
+	// Before removing the service, first take the hypervisor out of the aggregates,
+	// so when the hypervisor comes back, it doesn't end up with the old associations
+	aggs, err := openstack.GetAggregatesByName(ctx, r.computeClient)
 	if err != nil {
-		return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot list aggregates due to %v", err))
+		return fmt.Errorf("cannot list aggregates due to %w", err)
 	}
 
-	host := node.Name
+	host := hv.Name
 	for name, aggregate := range aggs {
 		if slices.Contains(aggregate.Hosts, host) {
 			opts := aggregates.RemoveHostOpts{Host: host}
 			if err = aggregates.RemoveHost(ctx, r.computeClient, aggregate.ID, opts).Err; err != nil {
-				msg := fmt.Sprintf("failed to remove host %v from aggregate %v due to %v", name, host, err)
-				return r.setDecommissioningCondition(ctx, hv, msg)
+				return fmt.Errorf("failed to remove host %v from aggregate %v due to %w", host, name, err)
 			}
 		}
 	}
 
 	// Deleting and evicted, so better delete the service
 	err = services.Delete(ctx, r.computeClient, hypervisor.Service.ID).ExtractErr()
 	if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
-		msg := fmt.Sprintf("cannot delete service %s due to %v", hypervisor.Service.ID, err)
-		return r.setDecommissioningCondition(ctx, hv, msg)
+		return fmt.Errorf("cannot delete service %s due to %w", hypervisor.Service.ID, err)
 	}
 
 	rp, err := resourceproviders.Get(ctx, r.placementClient, hypervisor.ID).Extract()
 	if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
-		return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot get resource provider: %v", err))
+		return fmt.Errorf("cannot get resource provider %s due to %w", hypervisor.ID, err)
 	}
 
 	if err = openstack.CleanupResourceProvider(ctx, r.placementClient, rp); err != nil {
-		return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot clean up resource provider: %v", err))
-	}
-
-	return r.removeFinalizer(ctx, node)
-}
-
-func (r *NodeDecommissionReconciler) removeFinalizer(ctx context.Context, node *corev1.Node) (ctrl.Result, error) {
-	if !controllerutil.ContainsFinalizer(node, decommissionFinalizerName) {
-		return ctrl.Result{}, nil
+		return fmt.Errorf("cannot clean up resource provider: %w", err)
 	}
 
-	nodeBase := node.DeepCopy()
-	controllerutil.RemoveFinalizer(node, decommissionFinalizerName)
-	err := r.Patch(ctx, node, k8sclient.MergeFromWithOptions(nodeBase,
-		k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName))
-	return ctrl.Result{}, err
-}
-
-func (r *NodeDecommissionReconciler) setDecommissioningCondition(ctx context.Context, hv *kvmv1.Hypervisor, message string) (ctrl.Result, error) {
-	base := hv.DeepCopy()
-	meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
-		Type:    kvmv1.ConditionTypeReady,
-		Status:  metav1.ConditionFalse,
-		Reason:  "Decommissioning",
-		Message: message,
-	})
-	if err := r.Status().Patch(ctx, hv, k8sclient.MergeFromWithOptions(base,
-		k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName)); err != nil {
-		return ctrl.Result{}, fmt.Errorf("cannot update hypervisor status due to %w", err)
-	}
-	return ctrl.Result{RequeueAfter: shortRetryTime}, nil
+	return nil
 }
 
 // SetupWithManager sets up the controller with the Manager.
@@ -217,6 +179,8 @@ func (r *NodeDecommissionReconciler) SetupWithManager(mgr ctrl.Manager) error {
 
 	return ctrl.NewControllerManagedBy(mgr).
 		Named(DecommissionControllerName).
-		For(&corev1.Node{}).
+		For(&kvmv1.Hypervisor{}).
+		WithEventFilter(utils.HypervisorTerminationPredicate).
+		WithEventFilter(utils.LifecycleEnabledPredicate).
 		Complete(r)
 }
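
For reference, a minimal sketch of how the two event filters registered in SetupWithManager could be implemented. The real utils.HypervisorTerminationPredicate and utils.LifecycleEnabledPredicate live in internal/utils and are not part of this diff, so the termination signal used below (a set deletion timestamp) is an assumption; spec.lifecycleEnabled is the field the previous version of the reconciler already consulted.

package utils

import (
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/predicate"

	kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
)

// LifecycleEnabledPredicate only lets events through for hypervisors whose
// lifecycle management is enabled (spec.lifecycleEnabled).
var LifecycleEnabledPredicate = predicate.NewPredicateFuncs(func(obj client.Object) bool {
	hv, ok := obj.(*kvmv1.Hypervisor)
	return ok && hv.Spec.LifecycleEnabled
})

// HypervisorTerminationPredicate only lets events through for hypervisors that
// are being terminated; the concrete signal is assumed here to be a non-zero
// deletion timestamp on the object.
var HypervisorTerminationPredicate = predicate.NewPredicateFuncs(func(obj client.Object) bool {
	return !obj.GetDeletionTimestamp().IsZero()
})

Both values satisfy predicate.Predicate, so they can be passed directly to WithEventFilter as done above.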