Skip to content

Commit e6aaf8d

Browse files
fix(preflight): check storage containers on all failure domains (#1215)
If control plane or a machine deployment uses failure domains, use the cluster from the failure domain instead of machine details. **How has this been tested?** 1. Misconfigured storage container (doesn't actually exist) on cluster in machine details or on cluster in failure domain (e.g. `ncn-dev-sandbox-gpu` doesn't have `k8s` storage container) 2. Create failure domain ``` apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 kind: NutanixFailureDomain metadata: name: fd-2 namespace: default spec: prismElementCluster: type: name name: ncn-dev-sandbox-gpu subnets: - type: name name: subnet-2 ``` 3. Create a cluster ``` apiVersion: cluster.x-k8s.io/v1beta1 kind: Cluster metadata: ... spec: ... topology: class: nutanix-quick-start controlPlane: metadata: {} replicas: 3 variables: ... - name: workerConfig value: nutanix: machineDetails: bootType: uefi cluster: name: ncn-dev-sandbox-gpu type: name imageLookup: baseOS: rocky-9.6 format: nkp-{{.BaseOS}}-release-{{.K8sVersion}}-* memorySize: 4Gi subnets: - name: vlan173 type: name systemDiskSize: 40Gi vcpuSockets: 2 vcpusPerSocket: 1 version: 1.33.1 workers: machineDeployments: - class: default-worker name: md-variable-override variables: overrides: - name: workerConfig value: nutanix: machineDetails: ... cluster: name: ncn-dev-sandbox-gpu type: name - class: default-worker failureDomain: fd-2 name: md-variable-override-failure-domain variables: overrides: - name: workerConfig value: nutanix: machineDetails: ... cluster: name: ncn-dev-sandbox-gpu type: name ``` 4. Observe pre-flight failure on 2 machine deployments ``` The request is invalid: * $.spec.topology.workers.machineDeployments[[email protected]=="md-variable-override-failure-domain"].failureDomain: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. * $.spec.topology.workers.machineDeployments[[email protected]=="md-variable-override"].variables[[email protected]=workerConfig].value.nutanix.machineDetails: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. ```
1 parent f1a860c commit e6aaf8d

File tree

3 files changed

+358
-36
lines changed

3 files changed

+358
-36
lines changed

pkg/webhook/preflight/nutanix/clients_test.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
clustermgmtv4 "github.com/nutanix/ntnx-api-golang-clients/clustermgmt-go-client/v4/models/clustermgmt/v4/config"
1010
netv4 "github.com/nutanix/ntnx-api-golang-clients/networking-go-client/v4/models/networking/v4/config"
1111
vmmv4 "github.com/nutanix/ntnx-api-golang-clients/vmm-go-client/v4/models/vmm/v4/content"
12+
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
1213

1314
prismv3 "github.com/nutanix-cloud-native/prism-go-client/v3"
1415
)
@@ -72,6 +73,30 @@ type mocknclient struct {
7273
) (*netv4.ListSubnetsApiResponse, error)
7374
}
7475

76+
type mockKubeClient struct {
77+
ctrlclient.Client
78+
SubResourceClient ctrlclient.SubResourceClient
79+
getFunc func(
80+
ctx context.Context,
81+
key ctrlclient.ObjectKey,
82+
obj ctrlclient.Object,
83+
opts ...ctrlclient.GetOption,
84+
) error
85+
}
86+
87+
func (m *mockKubeClient) Get(
88+
ctx context.Context,
89+
key ctrlclient.ObjectKey,
90+
obj ctrlclient.Object,
91+
opts ...ctrlclient.GetOption,
92+
) error {
93+
return m.getFunc(ctx, key, obj, opts...)
94+
}
95+
96+
func (m *mockKubeClient) SubResource(subResource string) ctrlclient.SubResourceClient {
97+
return m.SubResourceClient
98+
}
99+
75100
func (m *mocknclient) GetCurrentLoggedInUser(ctx context.Context) (*prismv3.UserIntentResponse, error) {
76101
return m.user, m.err
77102
}

pkg/webhook/preflight/nutanix/storagecontainer.go

Lines changed: 113 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@ import (
99
"sync"
1010

1111
clustermgmtv4 "github.com/nutanix/ntnx-api-golang-clients/clustermgmt-go-client/v4/models/clustermgmt/v4/config"
12+
"k8s.io/apimachinery/pkg/api/errors"
13+
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
1214

13-
"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/external/github.com/nutanix-cloud-native/cluster-api-provider-nutanix/api/v1beta1"
15+
capxv1 "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/external/github.com/nutanix-cloud-native/cluster-api-provider-nutanix/api/v1beta1"
1416
carenv1 "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/v1alpha1"
1517
"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/webhook/preflight"
1618
)
@@ -20,10 +22,16 @@ const (
2022
)
2123

2224
type storageContainerCheck struct {
25+
// machineSpec is used for cluster identifier when no failure domain is configured
2326
machineSpec *carenv1.NutanixMachineDetails
24-
field string
25-
csiSpec *carenv1.CSIProvider
26-
nclient client
27+
// failureDomainName is used for cluster identifier when failure domain is configured
28+
failureDomainName string
29+
namespace string
30+
kclient ctrlclient.Client
31+
// Common fields
32+
field string
33+
csiSpec *carenv1.CSIProvider
34+
nclient client
2735
}
2836

2937
func (c *storageContainerCheck) Name() string {
@@ -45,7 +53,40 @@ func (c *storageContainerCheck) Run(ctx context.Context) preflight.CheckResult {
4553
return result
4654
}
4755

48-
clusterIdentifier := &c.machineSpec.Cluster
56+
// Get cluster identifier based on whether failure domain is configured
57+
var clusterIdentifier *capxv1.NutanixResourceIdentifier
58+
var err error
59+
60+
if c.failureDomainName != "" {
61+
// Get cluster identifier from failure domain
62+
clusterIdentifier, err = c.getClusterIdentifierFromFailureDomain(ctx)
63+
if err != nil {
64+
result.Allowed = false
65+
if errors.IsNotFound(err) {
66+
result.Causes = append(result.Causes, preflight.Cause{
67+
Message: fmt.Sprintf(
68+
"NutanixFailureDomain %q referenced in cluster was not found in the management cluster. Please create it and retry.", //nolint:lll // Message is long.
69+
c.failureDomainName,
70+
),
71+
Field: c.field,
72+
})
73+
} else {
74+
result.InternalError = true
75+
result.Causes = append(result.Causes, preflight.Cause{
76+
Message: fmt.Sprintf(
77+
"Failed to get cluster identifier from NutanixFailureDomain %q: %v This is usually a temporary error. Please retry.", //nolint:lll // Message is long.
78+
c.failureDomainName,
79+
err,
80+
),
81+
Field: c.field,
82+
})
83+
}
84+
return result
85+
}
86+
} else {
87+
// Use cluster identifier from machine spec
88+
clusterIdentifier = &c.machineSpec.Cluster
89+
}
4990

5091
// To avoid unnecessary API calls, we delay the retrieval of clusters until we actually
5192
// need to check for storage containers.
@@ -151,6 +192,19 @@ func (c *storageContainerCheck) Run(ctx context.Context) preflight.CheckResult {
151192
return result
152193
}
153194

195+
// getClusterIdentifierFromFailureDomain fetches the failure domain and returns the cluster identifier.
196+
func (c *storageContainerCheck) getClusterIdentifierFromFailureDomain(
197+
ctx context.Context,
198+
) (*capxv1.NutanixResourceIdentifier, error) {
199+
fdObj := &capxv1.NutanixFailureDomain{}
200+
fdKey := ctrlclient.ObjectKey{Name: c.failureDomainName, Namespace: c.namespace}
201+
if err := c.kclient.Get(ctx, fdKey, fdObj); err != nil {
202+
return nil, err
203+
}
204+
205+
return &fdObj.Spec.PrismElementCluster, nil
206+
}
207+
154208
func newStorageContainerChecks(cd *checkDependencies) []preflight.Check {
155209
checks := []preflight.Check{}
156210

@@ -168,14 +222,29 @@ func newStorageContainerChecks(cd *checkDependencies) []preflight.Check {
168222
if cd.nutanixClusterConfigSpec != nil &&
169223
cd.nutanixClusterConfigSpec.ControlPlane != nil &&
170224
cd.nutanixClusterConfigSpec.ControlPlane.Nutanix != nil {
171-
// Skip the check if failureDomains are configured
172-
if len(cd.nutanixClusterConfigSpec.ControlPlane.Nutanix.FailureDomains) > 0 {
173-
cd.log.V(5).
174-
Info("Skip preflight check NutanixStorageContainer for controlPlane with failureDomains configured.")
225+
controlPlaneNutanix := cd.nutanixClusterConfigSpec.ControlPlane.Nutanix
226+
227+
// Check if failureDomains are configured for control plane
228+
if len(controlPlaneNutanix.FailureDomains) > 0 && cd.cluster != nil && cd.kclient != nil {
229+
// Create a check for each failure domain
230+
for _, fdName := range controlPlaneNutanix.FailureDomains {
231+
if fdName != "" {
232+
checks = append(checks,
233+
&storageContainerCheck{
234+
failureDomainName: fdName,
235+
namespace: cd.cluster.Namespace,
236+
kclient: cd.kclient,
237+
field: "$.spec.topology.variables[[email protected]==\"clusterConfig\"].value.controlPlane.nutanix.failureDomains", //nolint:lll // The field is long.
238+
csiSpec: &cd.nutanixClusterConfigSpec.Addons.CSI.Providers.NutanixCSI,
239+
nclient: cd.nclient,
240+
},
241+
)
242+
}
243+
}
175244
} else {
176245
checks = append(checks,
177246
&storageContainerCheck{
178-
machineSpec: &cd.nutanixClusterConfigSpec.ControlPlane.Nutanix.MachineDetails,
247+
machineSpec: &controlPlaneNutanix.MachineDetails,
179248
field: "$.spec.topology.variables[[email protected]==\"clusterConfig\"].value.controlPlane.nutanix.machineDetails",
180249
csiSpec: &cd.nutanixClusterConfigSpec.Addons.CSI.Providers.NutanixCSI,
181250
nclient: cd.nclient,
@@ -186,27 +255,41 @@ func newStorageContainerChecks(cd *checkDependencies) []preflight.Check {
186255

187256
for mdName, nutanixWorkerNodeConfigSpec := range cd.nutanixWorkerNodeConfigSpecByMachineDeploymentName {
188257
if nutanixWorkerNodeConfigSpec.Nutanix != nil {
189-
// Skip the check if failureDomain is configured
190-
if fdName, ok := cd.failureDomainByMachineDeploymentName[mdName]; ok {
191-
cd.log.V(5).Info(
192-
"Skip preflight check NutanixStorageContainer for machineDeployment with failureDomain configured.",
193-
"machineDeployment", mdName, "failureDomain", fdName,
258+
// Check if failureDomain is configured for this machine deployment
259+
if fdName, ok := cd.failureDomainByMachineDeploymentName[mdName]; ok &&
260+
fdName != "" &&
261+
cd.cluster != nil &&
262+
cd.kclient != nil {
263+
// Use failure domain for cluster information
264+
checks = append(checks,
265+
&storageContainerCheck{
266+
failureDomainName: fdName,
267+
namespace: cd.cluster.Namespace,
268+
kclient: cd.kclient,
269+
270+
field: fmt.Sprintf(
271+
"$.spec.topology.workers.machineDeployments[[email protected]==%q].failureDomain",
272+
mdName,
273+
),
274+
csiSpec: &cd.nutanixClusterConfigSpec.Addons.CSI.Providers.NutanixCSI,
275+
nclient: cd.nclient,
276+
},
277+
)
278+
} else {
279+
// Use machine details for cluster information
280+
checks = append(checks,
281+
&storageContainerCheck{
282+
machineSpec: &nutanixWorkerNodeConfigSpec.Nutanix.MachineDetails,
283+
//nolint:lll // The field is long.
284+
field: fmt.Sprintf(
285+
"$.spec.topology.workers.machineDeployments[[email protected]==%q].variables[[email protected]=workerConfig].value.nutanix.machineDetails",
286+
mdName,
287+
),
288+
csiSpec: &cd.nutanixClusterConfigSpec.Addons.CSI.Providers.NutanixCSI,
289+
nclient: cd.nclient,
290+
},
194291
)
195-
continue
196292
}
197-
198-
checks = append(checks,
199-
&storageContainerCheck{
200-
machineSpec: &nutanixWorkerNodeConfigSpec.Nutanix.MachineDetails,
201-
//nolint:lll // The field is long.
202-
field: fmt.Sprintf(
203-
"$.spec.topology.workers.machineDeployments[[email protected]==%q].variables[[email protected]=workerConfig].value.nutanix.machineDetails",
204-
mdName,
205-
),
206-
csiSpec: &cd.nutanixClusterConfigSpec.Addons.CSI.Providers.NutanixCSI,
207-
nclient: cd.nclient,
208-
},
209-
)
210293
}
211294
}
212295

@@ -236,7 +319,7 @@ func getStorageContainers(
236319

237320
func getClusters(
238321
client client,
239-
clusterIdentifier *v1beta1.NutanixResourceIdentifier,
322+
clusterIdentifier *capxv1.NutanixResourceIdentifier,
240323
) ([]clustermgmtv4.Cluster, error) {
241324
switch {
242325
case clusterIdentifier.IsUUID():

0 commit comments

Comments
 (0)