Commit 2f5551c

[decomission-controller] switch to maintenance-operator
This switches the decommissioning controller to use the terminating state of the maintenance spec field to decommission the node. The maintenance-operator will also take care of removing the finalizer, thus removing the need to access nodes at all.
1 parent d526639 commit 2f5551c

11 files changed: +216 -166 lines

api/v1/hypervisor_types.go

Lines changed: 5 additions & 4 deletions
@@ -57,10 +57,11 @@ const (
 	ConditionReasonReadyEvicted = "Evicted"

 	// ConditionTypeOnboarding reasons
-	ConditionReasonInitial    = "Initial"
-	ConditionReasonOnboarding = "Onboarding"
-	ConditionReasonTesting    = "Testing"
-	ConditionReasonAborted    = "Aborted"
+	ConditionReasonInitial        = "Initial"
+	ConditionReasonOnboarding     = "Onboarding"
+	ConditionReasonTesting        = "Testing"
+	ConditionReasonAborted        = "Aborted"
+	ConditionReasonDecomissioning = "Decomissioning"
 )

 // HypervisorSpec defines the desired state of Hypervisor

charts/openstack-hypervisor-operator/crds/hypervisor-crd.yaml

Lines changed: 12 additions & 0 deletions
@@ -315,6 +315,14 @@ spec:
             firmwareVersion:
               description: FirmwareVersion
               type: string
+            gardenLinuxCommitID:
+              description: Represents the Garden Linux build commit id
+              type: string
+            gardenLinuxFeatures:
+              description: Represents the Garden Linux Feature Set
+              items:
+                type: string
+              type: array
             hardwareModel:
               description: HardwareModel
               type: string
@@ -336,6 +344,10 @@ spec:
             prettyVersion:
               description: PrettyVersion
               type: string
+            variantID:
+              description: Identifying a specific variant or edition of the
+                operating system
+              type: string
             version:
               description: Represents the Operating System version.
               type: string
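
These schema additions would normally be generated from the Go API types in api/v1, which are changed in this commit but not shown on this page. A rough sketch of how the corresponding fields might look on the status type; the struct name OperatingSystem and the json tags are inferred from the schema, not taken from the repository:

// Hypothetical sketch; the real struct and its name live in api/v1 and are
// not visible in this excerpt. Descriptions are copied from the CRD schema.
type OperatingSystem struct {
	// Represents the Garden Linux build commit id
	GardenLinuxCommitID string `json:"gardenLinuxCommitID,omitempty"`
	// Represents the Garden Linux Feature Set
	GardenLinuxFeatures []string `json:"gardenLinuxFeatures,omitempty"`
	// Identifying a specific variant or edition of the operating system
	VariantID string `json:"variantID,omitempty"`
}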

internal/controller/aggregates_controller.go

Lines changed: 1 addition & 19 deletions
@@ -70,7 +70,7 @@ func (ac *AggregatesController) Reconcile(ctx context.Context, req ctrl.Request)
 		return ctrl.Result{}, nil
 	}

-	aggs, err := aggregatesByName(ctx, ac.computeClient)
+	aggs, err := openstack.GetAggregatesByName(ctx, ac.computeClient)
 	if err != nil {
 		err = fmt.Errorf("failed listing aggregates: %w", err)
 		if err2 := ac.setErrorCondition(ctx, hv, err.Error()); err2 != nil {
@@ -163,24 +163,6 @@ func (ac *AggregatesController) SetupWithManager(mgr ctrl.Manager) error {
 		Complete(ac)
 }

-func aggregatesByName(ctx context.Context, serviceClient *gophercloud.ServiceClient) (map[string]*aggregates.Aggregate, error) {
-	pages, err := aggregates.List(serviceClient).AllPages(ctx)
-	if err != nil {
-		return nil, fmt.Errorf("cannot list aggregates due to %w", err)
-	}
-
-	aggs, err := aggregates.ExtractAggregates(pages)
-	if err != nil {
-		return nil, fmt.Errorf("cannot list aggregates due to %w", err)
-	}
-
-	aggregateMap := make(map[string]*aggregates.Aggregate, len(aggs))
-	for _, aggregate := range aggs {
-		aggregateMap[aggregate.Name] = &aggregate
-	}
-	return aggregateMap, nil
-}
-
 func addToAggregate(ctx context.Context, serviceClient *gophercloud.ServiceClient, aggs map[string]*aggregates.Aggregate, host, name, zone string) (err error) {
 	aggregate, found := aggs[name]
 	log := logger.FromContext(ctx)
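
The deleted helper reappears at the call sites as openstack.GetAggregatesByName, so it has presumably been moved into the internal/openstack package (one of the 11 changed files, not shown on this page). A minimal sketch of the relocated function, reconstructed from the deleted code above; the target file and its imports (context, fmt, gophercloud, aggregates) are assumed to mirror the controller's:

// Reconstruction based on the deleted aggregatesByName; the exact file in
// internal/openstack is not visible in this excerpt.
func GetAggregatesByName(ctx context.Context, serviceClient *gophercloud.ServiceClient) (map[string]*aggregates.Aggregate, error) {
	pages, err := aggregates.List(serviceClient).AllPages(ctx)
	if err != nil {
		return nil, fmt.Errorf("cannot list aggregates due to %w", err)
	}

	aggs, err := aggregates.ExtractAggregates(pages)
	if err != nil {
		return nil, fmt.Errorf("cannot list aggregates due to %w", err)
	}

	// Index aggregates by name; taking &aggregate relies on Go 1.22+
	// per-iteration loop variables, as the original code did.
	aggregateMap := make(map[string]*aggregates.Aggregate, len(aggs))
	for _, aggregate := range aggs {
		aggregateMap[aggregate.Name] = &aggregate
	}
	return aggregateMap, nil
}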

internal/controller/decomission_controller.go

Lines changed: 62 additions & 98 deletions
@@ -26,25 +26,24 @@ import (

 	"github.com/gophercloud/gophercloud/v2"
 	"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates"
+	"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/hypervisors"
 	"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/services"
 	"github.com/gophercloud/gophercloud/v2/openstack/placement/v1/resourceproviders"
-	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/util/retry"
 	ctrl "sigs.k8s.io/controller-runtime"
 	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	logger "sigs.k8s.io/controller-runtime/pkg/log"

 	kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
 	"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/openstack"
+	"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/utils"
 )

 const (
-	decommissionFinalizerName  = "cobaltcore.cloud.sap/decommission-hypervisor"
-	DecommissionControllerName = "nodeDecommission"
+	DecommissionControllerName = "decommission"
 )

 type NodeDecommissionReconciler struct {
@@ -57,145 +56,108 @@ type NodeDecommissionReconciler struct {
 // The counter-side in gardener is here:
 // https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L646

-// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;patch;update
-// +kubebuilder:rbac:groups="",resources=nodes/finalizers,verbs=update
 // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch
 // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;update;patch
+
 func (r *NodeDecommissionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	hostname := req.Name
-	log := logger.FromContext(ctx).WithName(req.Name).WithValues("hostname", hostname)
-	ctx = logger.IntoContext(ctx, log)
+	log := logger.FromContext(ctx).WithName(req.Name)
+	hv := &kvmv1.Hypervisor{}
+	if err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+		if err := r.Get(ctx, req.NamespacedName, hv); err != nil {
+			// ignore not found errors, could be deleted
+			return k8sclient.IgnoreNotFound(err)
+		}

-	node := &corev1.Node{}
-	if err := r.Get(ctx, req.NamespacedName, node); err != nil {
-		return ctrl.Result{}, k8sclient.IgnoreNotFound(err)
-	}
+		setDecommissioningCondition := func(msg string) {
+			meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
+				Type:    kvmv1.ConditionTypeReady,
+				Status:  metav1.ConditionFalse,
+				Reason:  kvmv1.ConditionReasonDecomissioning,
+				Message: msg,
+			})
+		}

-	// Fetch HV to check if lifecycle management is enabled
-	hv := &kvmv1.Hypervisor{}
-	if err := r.Get(ctx, k8sclient.ObjectKey{Name: hostname}, hv); err != nil {
-		// ignore not found errors, could be deleted
-		return ctrl.Result{}, k8sclient.IgnoreNotFound(err)
-	}
-	if !hv.Spec.LifecycleEnabled {
-		// Get out of the way
-		return r.removeFinalizer(ctx, node)
-	}
+		if meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeReady) {
+			setDecommissioningCondition("Node is being decommissioned, removing host from nova")
+			return r.Status().Update(ctx, hv)
+		}

-	if !controllerutil.ContainsFinalizer(node, decommissionFinalizerName) {
-		return ctrl.Result{}, retry.RetryOnConflict(retry.DefaultRetry, func() error {
-			patch := k8sclient.MergeFrom(node.DeepCopy())
-			controllerutil.AddFinalizer(node, decommissionFinalizerName)
-			if err := r.Patch(ctx, node, patch); err != nil {
-				return fmt.Errorf("failed to add finalizer due to %w", err)
+		hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hv.Name, true)
+		if err != nil {
+			if errors.Is(err, openstack.ErrNoHypervisor) {
+				// We are (hopefully) done
+				setDecommissioningCondition("Node not registered in nova anymore, proceeding with deletion")
+				hv.Status.Evicted = true
+				return r.Status().Update(ctx, hv)
 			}
-			log.Info("Added finalizer")
-			return nil
-		})
-	}

-	// Not yet deleting hv, nothing more to do
-	if node.DeletionTimestamp.IsZero() {
-		return ctrl.Result{}, nil
-	}
+			setDecommissioningCondition(fmt.Sprintf("Failed to get %q from openstack: %v", hv.Name, err))
+			return r.Status().Update(ctx, hv)
+		}

-	// Someone is just deleting the hv, without going through termination
-	// See: https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L658-L659
-	if !IsNodeConditionTrue(node.Status.Conditions, "Terminating") {
-		log.Info("removing finalizer since not terminating")
-		// So we just get out of the way for now
-		return r.removeFinalizer(ctx, node)
-	}
+		if err = r.doDecomission(ctx, hv, hypervisor); err != nil {
+			log.Error(err, "Failed to decomission node", "node", hv.Name)
+			setDecommissioningCondition(err.Error())
+			return r.Status().Update(ctx, hv)
+		}

-	if meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeReady) {
-		return r.setDecommissioningCondition(ctx, hv, "Node is being decommissioned, removing host from nova")
+		// decomissioning succeeded, proceed with deletion
+		hv.Status.Evicted = true
+		return r.Status().Update(ctx, hv)
+	}); err != nil {
+		return ctrl.Result{}, err
 	}

-	log.Info("removing host from nova")
-
-	hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hostname, true)
-	if errors.Is(err, openstack.ErrNoHypervisor) {
-		// We are (hopefully) done
-		return r.removeFinalizer(ctx, node)
-	}
+	return ctrl.Result{RequeueAfter: utils.ShortRetryTime}, nil
+}

+func (r *NodeDecommissionReconciler) doDecomission(ctx context.Context, hv *kvmv1.Hypervisor, hypervisor *hypervisors.Hypervisor) error {
 	// TODO: remove since RunningVMs is only available until micro-version 2.87, and also is updated asynchronously
 	// so it might be not accurate
 	if hypervisor.RunningVMs > 0 {
 		// Still running VMs, cannot delete the service
-		msg := fmt.Sprintf("Node is being decommissioned, but still has %d running VMs", hypervisor.RunningVMs)
-		return r.setDecommissioningCondition(ctx, hv, msg)
+		return fmt.Errorf("node is being decommissioned, but still has %d running VMs", hypervisor.RunningVMs)
 	}

 	if hypervisor.Servers != nil && len(*hypervisor.Servers) > 0 {
 		// Still VMs assigned to the host, cannot delete the service
-		msg := fmt.Sprintf("Node is being decommissioned, but still has %d assigned VMs, "+
-			"check with `openstack server list --all-projects --host %s`", len(*hypervisor.Servers), hostname)
-		return r.setDecommissioningCondition(ctx, hv, msg)
+		return fmt.Errorf("node is being decommissioned, but still has %d assigned VMs, "+
+			"check with `openstack server list --all-projects --host %s`", len(*hypervisor.Servers), hv.Name)
 	}

-	// Before removing the service, first take the node out of the aggregates,
-	// so when the node comes back, it doesn't up with the old associations
-	aggs, err := aggregatesByName(ctx, r.computeClient)
+	// Before removing the service, first take the hypervisor out of the aggregates,
+	// so when the hypervisor comes back, it doesn't up with the old associations
+	aggs, err := openstack.GetAggregatesByName(ctx, r.computeClient)
 	if err != nil {
-		return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot list aggregates due to %v", err))
+		return fmt.Errorf("cannot list aggregates due to: %w", err)
 	}

-	host := node.Name
+	host := hv.Name
 	for name, aggregate := range aggs {
 		if slices.Contains(aggregate.Hosts, host) {
 			opts := aggregates.RemoveHostOpts{Host: host}
 			if err = aggregates.RemoveHost(ctx, r.computeClient, aggregate.ID, opts).Err; err != nil {
-				msg := fmt.Sprintf("failed to remove host %v from aggregate %v due to %v", name, host, err)
-				return r.setDecommissioningCondition(ctx, hv, msg)
+				return fmt.Errorf("failed to remove host %v from aggregate %v due to %w", name, host, err)
 			}
 		}
 	}

 	// Deleting and evicted, so better delete the service
 	err = services.Delete(ctx, r.computeClient, hypervisor.Service.ID).ExtractErr()
 	if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
-		msg := fmt.Sprintf("cannot delete service %s due to %v", hypervisor.Service.ID, err)
-		return r.setDecommissioningCondition(ctx, hv, msg)
+		return fmt.Errorf("cannot delete service %s due to %w", hypervisor.Service.ID, err)
 	}

 	rp, err := resourceproviders.Get(ctx, r.placementClient, hypervisor.ID).Extract()
 	if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
-		return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot get resource provider: %v", err))
+		return fmt.Errorf("cannot get resource provider %s due to %w", hypervisor.ID, err)
 	}

 	if err = openstack.CleanupResourceProvider(ctx, r.placementClient, rp); err != nil {
-		return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot clean up resource provider: %v", err))
-	}
-
-	return r.removeFinalizer(ctx, node)
-}
-
-func (r *NodeDecommissionReconciler) removeFinalizer(ctx context.Context, node *corev1.Node) (ctrl.Result, error) {
-	if !controllerutil.ContainsFinalizer(node, decommissionFinalizerName) {
-		return ctrl.Result{}, nil
+		return fmt.Errorf("cannot cleanup resource provider: %w", err)
 	}

-	nodeBase := node.DeepCopy()
-	controllerutil.RemoveFinalizer(node, decommissionFinalizerName)
-	err := r.Patch(ctx, node, k8sclient.MergeFromWithOptions(nodeBase,
-		k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName))
-	return ctrl.Result{}, err
-}
-
-func (r *NodeDecommissionReconciler) setDecommissioningCondition(ctx context.Context, hv *kvmv1.Hypervisor, message string) (ctrl.Result, error) {
-	base := hv.DeepCopy()
-	meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
-		Type:    kvmv1.ConditionTypeReady,
-		Status:  metav1.ConditionFalse,
-		Reason:  "Decommissioning",
-		Message: message,
-	})
-	if err := r.Status().Patch(ctx, hv, k8sclient.MergeFromWithOptions(base,
-		k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName)); err != nil {
-		return ctrl.Result{}, fmt.Errorf("cannot update hypervisor status due to %w", err)
-	}
-	return ctrl.Result{RequeueAfter: shortRetryTime}, nil
+	return nil
 }

 // SetupWithManager sets up the controller with the Manager.
@@ -217,6 +179,8 @@ func (r *NodeDecommissionReconciler) SetupWithManager(mgr ctrl.Manager) error {

 	return ctrl.NewControllerManagedBy(mgr).
 		Named(DecommissionControllerName).
-		For(&corev1.Node{}).
+		For(&kvmv1.Hypervisor{}).
+		WithEventFilter(utils.HypervisorTerminationPredicate).
+		WithEventFilter(utils.LifecycleEnabledPredicate).
 		Complete(r)
 }
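
The rebuilt SetupWithManager watches Hypervisor objects and filters events with utils.HypervisorTerminationPredicate and utils.LifecycleEnabledPredicate from internal/utils, which is changed in this commit but not shown on this page. A rough sketch of what such predicates could look like, assuming the Hypervisor spec exposes a maintenance field whose terminating state triggers decommissioning as described in the commit message; the Maintenance field name and the "terminating" value are guesses, while Spec.LifecycleEnabled does appear in the removed code:

package utils

import (
	kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
)

// LifecycleEnabledPredicate only passes hypervisors with lifecycle management
// enabled, replacing the in-Reconcile check that this commit removes.
var LifecycleEnabledPredicate = predicate.NewPredicateFuncs(func(obj client.Object) bool {
	hv, ok := obj.(*kvmv1.Hypervisor)
	return ok && hv.Spec.LifecycleEnabled
})

// HypervisorTerminationPredicate only passes hypervisors whose maintenance
// spec field has reached the terminating state (field name and value assumed).
var HypervisorTerminationPredicate = predicate.NewPredicateFuncs(func(obj client.Object) bool {
	hv, ok := obj.(*kvmv1.Hypervisor)
	return ok && hv.Spec.Maintenance == "terminating"
})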
