fix(preflight): ensure MDs without overrides are also checked (#1216)

thunderboltsid · web-flow · commit c145c9915fda · 2025-07-17T15:06:24.000-07:00
We were not adding preflight checks for worker machine deployments that did not have variable overrides set. Note: this is stacked on top of #1215 mostly for unit tests. Adding a do not merge label until #1215 is merged. **How has this been tested?** 1. Misconfigured storage container (doesn't actually exist) on cluster in machine details or on cluster in failure domain (e.g. `ncn-dev-sandbox-gpu` doesn't have `k8s` storage container) 2. Create failure domain ``` apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 kind: NutanixFailureDomain metadata: name: fd-2 namespace: default spec: prismElementCluster: type: name name: ncn-dev-sandbox-gpu subnets: - type: name name: subnet-2 ``` 3. Create a cluster without the changes in this PR ``` apiVersion: cluster.x-k8s.io/v1beta1 kind: Cluster metadata: ... spec: ... topology: class: nutanix-quick-start controlPlane: metadata: {} replicas: 3 variables: ... - name: workerConfig value: nutanix: machineDetails: bootType: uefi cluster: name: ncn-dev-sandbox-gpu type: name imageLookup: baseOS: rocky-9.6 format: nkp-{{.BaseOS}}-release-{{.K8sVersion}}-* memorySize: 4Gi subnets: - name: vlan173 type: name systemDiskSize: 40Gi vcpuSockets: 2 vcpusPerSocket: 1 version: 1.33.1 workers: machineDeployments: - class: default-worker name: md-variable-override variables: overrides: - name: workerConfig value: nutanix: machineDetails: ... cluster: name: ncn-dev-sandbox-gpu type: name - class: default-worker failureDomain: fd-2 name: md-variable-override-failure-domain variables: overrides: - name: workerConfig value: nutanix: machineDetails: ... cluster: name: ncn-dev-sandbox-gpu type: name - class: default-worker name: md-no-overrides - class: default-worker failureDomain: fd-2 name: md-no-overrides-failure-domain ``` 4. 
Observe pre-flight failure on only 2 machine deployments ``` The request is invalid: * $.spec.topology.workers.machineDeployments[?@.name=="md-variable-override-failure-domain"].failureDomain: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. * $.spec.topology.workers.machineDeployments[?@.name=="md-variable-override"].variables[?@.name=workerConfig].value.nutanix.machineDetails: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. ``` 5. Create the same cluster with changes in this PR and observe pre-flight failure on all 4 machine deployments ``` The request is invalid: * $.spec.topology.workers.machineDeployments[?@.name=="md-variable-override-failure-domain"].failureDomain: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. * $.spec.topology.workers.machineDeployments[?@.name=="md-no-overrides"].variables[?@.name=workerConfig].value.nutanix.machineDetails: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. * $.spec.topology.workers.machineDeployments[?@.name=="md-no-overrides-failure-domain"].failureDomain: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. * $.spec.topology.workers.machineDeployments[?@.name=="md-variable-override"].variables[?@.name=workerConfig].value.nutanix.machineDetails: Found no Storage Containers with name "k8s" on Cluster "ncn-dev-sandbox-gpu". Create a Storage Container with this name on Cluster "ncn-dev-sandbox-gpu", and then retry. ```
diff --git a/pkg/webhook/preflight/nutanix/specs.go b/pkg/webhook/preflight/nutanix/specs.go
@@ -7,6 +7,8 @@ import (
 	"context"
 	"fmt"
 
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+
 	carenv1 "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/v1alpha1"
 	"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/variables"
 	"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/webhook/preflight"
@@ -77,12 +79,41 @@ func newConfigurationCheck(
 				failureDomainByMachineDeploymentName[md.Name] = *md.FailureDomain
 			}
 
-			if md.Variables == nil {
+			var workerConfigVar *clusterv1.ClusterVariable
+			var workerConfigFieldPath string
+			if md.Variables != nil {
+				workerConfigVar = variables.GetClusterVariableByName(
+					carenv1.WorkerConfigVariableName,
+					md.Variables.Overrides,
+				)
+				if workerConfigVar != nil {
+					workerConfigFieldPath = fmt.Sprintf(
+						"$.spec.topology.workers.machineDeployments[?@.name==%q].variables[?@.name=='%s'].value.nutanix",
+						md.Name,
+						carenv1.WorkerConfigVariableName,
+					)
+				}
+			}
+			if workerConfigVar == nil {
+				workerConfigVar = variables.GetClusterVariableByName(
+					carenv1.WorkerConfigVariableName,
+					cd.cluster.Spec.Topology.Variables,
+				)
+				if workerConfigVar != nil {
+					workerConfigFieldPath = fmt.Sprintf(
+						"$.spec.topology.variables[?@.name=='%s'].value.nutanix",
+						carenv1.WorkerConfigVariableName,
+					)
+				}
+			}
+
+			if workerConfigVar == nil {
 				continue
 			}
+
 			nutanixWorkerNodeConfigSpec := &carenv1.NutanixWorkerNodeConfigSpec{}
 			err := variables.UnmarshalClusterVariable(
-				variables.GetClusterVariableByName(carenv1.WorkerConfigVariableName, md.Variables.Overrides),
+				workerConfigVar,
 				nutanixWorkerNodeConfigSpec,
 			)
 			if err != nil {
@@ -92,24 +123,22 @@ func newConfigurationCheck(
 				configurationCheck.result.Causes = append(configurationCheck.result.Causes,
 					preflight.Cause{
 						Message: fmt.Sprintf(
-							"Failed to unmarshal topology machineDeployment variable %q: %s. Review the Cluster.", ///nolint:lll // Message is long.
+							"Failed to unmarshal variable %q: %s. Review the Cluster.", ///nolint:lll // Message is long.
 							carenv1.WorkerConfigVariableName,
 							err,
 						),
-						//nolint:lll // The field is long.
-						Field: fmt.Sprintf(
-							"$.spec.topology.workers.machineDeployments[?@.name==%q].variables[?@.name=workerConfig].value.nutanix.machineDetails",
-							md.Name,
-						),
+						Field: workerConfigFieldPath,
 					},
 				)
 			}
+
 			// Save the NutanixWorkerNodeConfigSpec only if it contains Nutanix configuration.
 			if nutanixWorkerNodeConfigSpec.Nutanix != nil {
 				nutanixWorkerNodeConfigSpecByMachineDeploymentName[md.Name] = nutanixWorkerNodeConfigSpec
 			}
 		}
 	}
+
 	// Save the NutanixWorkerNodeConfigSpecByMachineDeploymentName only if it contains at least one Nutanix configuration.
 	if len(nutanixWorkerNodeConfigSpecByMachineDeploymentName) > 0 {
 		cd.nutanixWorkerNodeConfigSpecByMachineDeploymentName = nutanixWorkerNodeConfigSpecByMachineDeploymentName
diff --git a/pkg/webhook/preflight/nutanix/specs_test.go b/pkg/webhook/preflight/nutanix/specs_test.go
@@ -10,6 +10,7 @@ import (
 	"github.com/go-logr/logr/testr"
 	"github.com/stretchr/testify/assert"
 	v1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
+	"k8s.io/utils/ptr"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 
 	carenv1 "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/v1alpha1"
@@ -58,7 +59,7 @@ func TestNewConfigurationCheck(t *testing.T) {
 								Name: carenv1.ClusterConfigVariableName,
 								Value: v1.JSON{
 									Raw: []byte(
-										`{"controlPlane": {"nutanix": {"prismElement": {"address": "pe.example.com"}}}}`,
+										`{"controlPlane": {"nutanix": {"cluster": {"name": "cluster-from-variable"}}}}`,
 									),
 								},
 							},
@@ -96,7 +97,7 @@ func TestNewConfigurationCheck(t *testing.T) {
 												Name: carenv1.WorkerConfigVariableName,
 												Value: v1.JSON{
 													Raw: []byte(
-														`{"nutanix": {"prismElement": {"address": "pe.example.com"}}}`,
+														`{"nutanix": {"cluster": {"name": "cluster-from-override"}}}`,
 													),
 												},
 											},
@@ -183,8 +184,8 @@ func TestNewConfigurationCheck(t *testing.T) {
 				InternalError: true,
 				Causes: []preflight.Cause{
 					{
-						Message: "Failed to unmarshal topology machineDeployment variable \"workerConfig\": failed to unmarshal json: invalid character 'i' looking for beginning of object key string. Review the Cluster.", ///nolint:lll // The message is long.
-						Field:   "$.spec.topology.workers.machineDeployments[?@.name==\"md-0\"].variables[?@.name=workerConfig].value.nutanix.machineDetails",                                                                ///nolint:lll // The field is long.
+						Message: "Failed to unmarshal variable \"workerConfig\": failed to unmarshal json: invalid character 'i' looking for beginning of object key string. Review the Cluster.", ///nolint:lll // The message is long.
+						Field:   "$.spec.topology.workers.machineDeployments[?@.name==\"md-0\"].variables[?@.name=='workerConfig'].value.nutanix",                                                 ///nolint:lll // The field is long.
 					},
 				},
 			},
@@ -238,7 +239,7 @@ func TestNewConfigurationCheck(t *testing.T) {
 												Name: carenv1.WorkerConfigVariableName,
 												Value: v1.JSON{
 													Raw: []byte(
-														`{"nutanix": {"prismElement": {"address": "pe1.example.com"}}}`,
+														`{"nutanix": {"cluster": {"name": "cluster-from-override"}}}`,
 													),
 												},
 											},
@@ -253,7 +254,7 @@ func TestNewConfigurationCheck(t *testing.T) {
 												Name: carenv1.WorkerConfigVariableName,
 												Value: v1.JSON{
 													Raw: []byte(
-														`{"nutanix": {"prismElement": {"address": "pe2.example.com"}}}`,
+														`{"nutanix": {"cluster": {"name": "cluster-from-override"}}}`,
 													),
 												},
 											},
@@ -272,6 +273,89 @@ func TestNewConfigurationCheck(t *testing.T) {
 			expectedWorkerNodeConfigSpecMapNotEmpty:   true,
 			expectedWorkerNodeConfigSpecMapEntryCount: 2,
 		},
+		{
+			name: "worker config from cluster variables",
+			cluster: &clusterv1.Cluster{
+				Spec: clusterv1.ClusterSpec{
+					Topology: &clusterv1.Topology{
+						Variables: []clusterv1.ClusterVariable{
+							{
+								Name: carenv1.ClusterConfigVariableName,
+								Value: v1.JSON{
+									Raw: []byte(`{}`),
+								},
+							},
+							{
+								Name: carenv1.WorkerConfigVariableName,
+								Value: v1.JSON{
+									Raw: []byte(
+										`{"nutanix": {"cluster": {"name": "cluster-from-variable"}}}`,
+									),
+								},
+							},
+						},
+						Workers: &clusterv1.WorkersTopology{
+							MachineDeployments: []clusterv1.MachineDeploymentTopology{
+								{
+									Name: "md-0",
+								},
+							},
+						},
+					},
+				},
+			},
+			expectedResult: preflight.CheckResult{
+				Allowed: true,
+			},
+			expectedNutanixClusterConfigSpec:          false,
+			expectedWorkerNodeConfigSpecMapNotEmpty:   true,
+			expectedWorkerNodeConfigSpecMapEntryCount: 1,
+		},
+		{
+			name: "worker config with failure domain",
+			cluster: &clusterv1.Cluster{
+				Spec: clusterv1.ClusterSpec{
+					Topology: &clusterv1.Topology{
+						Variables: []clusterv1.ClusterVariable{
+							{
+								Name: carenv1.ClusterConfigVariableName,
+								Value: v1.JSON{
+									Raw: []byte(
+										`{"nutanix": {"failureDomains": ["fd-1", "fd-2", "fd-3"]}}`,
+									),
+								},
+							},
+						},
+						Workers: &clusterv1.WorkersTopology{
+							MachineDeployments: []clusterv1.MachineDeploymentTopology{
+								{
+									Name: "md-0",
+									Variables: &clusterv1.MachineDeploymentVariables{
+										Overrides: []clusterv1.ClusterVariable{
+											{
+												Name: carenv1.WorkerConfigVariableName,
+												Value: v1.JSON{
+													Raw: []byte(
+														`{"nutanix": {"cluster": {"name": "worker-cluster"}, "subnets": [{"name": "worker-subnet"}]}}`,
+													),
+												},
+											},
+										},
+									},
+									FailureDomain: ptr.To("fd-1"),
+								},
+							},
+						},
+					},
+				},
+			},
+			expectedResult: preflight.CheckResult{
+				Allowed: true,
+			},
+			expectedNutanixClusterConfigSpec:          true,
+			expectedWorkerNodeConfigSpecMapNotEmpty:   true,
+			expectedWorkerNodeConfigSpecMapEntryCount: 1,
+		},
 		{
 			name: "worker config without nutanix field",
 			cluster: &clusterv1.Cluster{
@@ -342,6 +426,77 @@ func TestNewConfigurationCheck(t *testing.T) {
 			expectedWorkerNodeConfigSpecMapNotEmpty:   false,
 			expectedWorkerNodeConfigSpecMapEntryCount: 0,
 		},
+		{
+			name: "mixed worker scenarios - with/without overrides and with/without failure domains",
+			cluster: &clusterv1.Cluster{
+				Spec: clusterv1.ClusterSpec{
+					Topology: &clusterv1.Topology{
+						Variables: []clusterv1.ClusterVariable{
+							{
+								Name: carenv1.ClusterConfigVariableName,
+								Value: v1.JSON{
+									Raw: []byte(`{"nutanix": {"failureDomains": ["fd-1", "fd-2", "fd-3"]}}`),
+								},
+							},
+							{
+								Name: carenv1.WorkerConfigVariableName,
+								Value: v1.JSON{
+									Raw: []byte(`{"nutanix": {"cluster": {"name": "cluster-from-variable"}}}`),
+								},
+							},
+						},
+						Workers: &clusterv1.WorkersTopology{
+							MachineDeployments: []clusterv1.MachineDeploymentTopology{
+								{
+									Name: "md-with-overrides",
+									Variables: &clusterv1.MachineDeploymentVariables{
+										Overrides: []clusterv1.ClusterVariable{
+											{
+												Name: carenv1.WorkerConfigVariableName,
+												Value: v1.JSON{
+													Raw: []byte(
+														`{"nutanix": {"cluster": {"name": "cluster-from-override"}}}`,
+													),
+												},
+											},
+										},
+									},
+								},
+								{
+									Name: "md-without-overrides",
+								},
+								{
+									Name:          "md-with-overrides-and-fd",
+									FailureDomain: ptr.To("fd-1"),
+									Variables: &clusterv1.MachineDeploymentVariables{
+										Overrides: []clusterv1.ClusterVariable{
+											{
+												Name: carenv1.WorkerConfigVariableName,
+												Value: v1.JSON{
+													Raw: []byte(
+														`{"nutanix": {"cluster": {"name": "cluster-from-override"}}}`,
+													),
+												},
+											},
+										},
+									},
+								},
+								{
+									Name:          "md-without-overrides-and-fd",
+									FailureDomain: ptr.To("fd-1"),
+								},
+							},
+						},
+					},
+				},
+			},
+			expectedResult: preflight.CheckResult{
+				Allowed: true,
+			},
+			expectedNutanixClusterConfigSpec:          true,
+			expectedWorkerNodeConfigSpecMapNotEmpty:   true,
+			expectedWorkerNodeConfigSpecMapEntryCount: 4,
+		},
 	}
 
 	for _, tt := range tests {