ci-entrypoint.sh: Incrementally scale large clusters #5779

Closed: wants to merge 3 commits
Changes from all commits
11 changes: 9 additions & 2 deletions azure/services/virtualmachines/virtualmachines.go
@@ -39,7 +39,14 @@ import (
)

const serviceName = "virtualmachine"
const vmMissingUAI = "VM is missing expected user assigned identity with client ID: "

func vmMissingUAI(expectedKey string, actualIdentities []infrav1.UserAssignedIdentity) string {
var actual []string
for _, a := range actualIdentities {
actual = append(actual, a.ProviderID)
}
return "VM is missing expected user assigned identity with ID " + expectedKey + ", VM has identities " + strings.Join(actual, ", ")
}

// VMScope defines the scope interface for a virtual machines service.
type VMScope interface {
@@ -173,7 +180,7 @@ func (s *Service) checkUserAssignedIdentities(specIdentities []infrav1.UserAssig
for _, expectedIdentity := range specIdentities {
_, exists := actualMap[expectedIdentity.ProviderID]
if !exists {
s.Scope.SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI+expectedIdentity.ProviderID)
s.Scope.SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI(expectedIdentity.ProviderID, vmIdentities))
return
}
}
2 changes: 1 addition & 1 deletion azure/services/virtualmachines/virtualmachines_test.go
@@ -358,7 +358,7 @@ func TestCheckUserAssignedIdentities(t *testing.T) {
scopeMock := mock_virtualmachines.NewMockVMScope(mockCtrl)

if tc.expectedKey != "" {
scopeMock.EXPECT().SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI+tc.expectedKey).Times(1)
scopeMock.EXPECT().SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI(tc.expectedKey, tc.actualIdentities)).Times(1)
}
s := &Service{
Scope: scopeMock,
4 changes: 2 additions & 2 deletions controllers/azuremachine_controller.go
@@ -289,8 +289,8 @@ func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineS
if cond != nil && cond.Status == corev1.ConditionFalse && cond.Reason == infrav1.UserAssignedIdentityMissingReason {
amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, infrav1.UserAssignedIdentityMissingReason, "VM is unhealthy")
machineScope.SetFailureReason(azure.UnsupportedChange)
machineScope.SetFailureMessage(errors.New("VM identities are not ready"))
return reconcile.Result{}, errors.New("VM identities are not ready")
machineScope.SetFailureMessage(errors.New(cond.Message))
return reconcile.Result{}, errors.New(cond.Message)
}

ams, err := amr.createAzureMachineService(machineScope)
37 changes: 24 additions & 13 deletions scripts/ci-entrypoint.sh
@@ -30,11 +30,15 @@ KIND="${REPO_ROOT}/hack/tools/bin/kind"
KUSTOMIZE="${REPO_ROOT}/hack/tools/bin/kustomize"
make --directory="${REPO_ROOT}" "${KUBECTL##*/}" "${HELM##*/}" "${KIND##*/}" "${KUSTOMIZE##*/}"
KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:-capz}"
WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}"
EXTRA_NODES_PER_SCALEOUT="${EXTRA_NODES_PER_SCALEOUT:-100}"
export TOTAL_WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}"
WORKER_MACHINE_COUNT=0
export KIND_CLUSTER_NAME
# export the variables so they are available in bash -c wait_for_nodes below
export KUBECTL
export HELM
export REPO_ROOT
export EXTRA_NODES_PER_SCALEOUT

# shellcheck source=hack/ensure-go.sh
source "${REPO_ROOT}/hack/ensure-go.sh"
@@ -95,7 +99,7 @@ setup() {
echo ''
)}"
export AZURE_RESOURCE_GROUP="${CLUSTER_NAME}"
if [ "${WORKER_MACHINE_COUNT}" -gt "10" ]; then
if [ "${TOTAL_WORKER_MACHINE_COUNT}" -gt "10" ]; then
export AZURE_LOCATION="${AZURE_LOCATION:-$(capz::util::get_random_region_load)}"
echo "Using AZURE_LOCATION: ${AZURE_LOCATION}"
else
@@ -112,7 +116,7 @@ setup() {
# Need a cluster with at least 2 nodes
export CONTROL_PLANE_MACHINE_COUNT="${CONTROL_PLANE_MACHINE_COUNT:-1}"
export CCM_COUNT="${CCM_COUNT:-1}"
export WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}"
export WORKER_MACHINE_COUNT
export EXP_CLUSTER_RESOURCE_SET="true"

# TODO figure out a better way to account for expected Windows node count
@@ -180,19 +184,26 @@ wait_for_copy_kubeadm_config_map() {

# wait_for_nodes returns when all nodes in the workload cluster are Ready.
wait_for_nodes() {
echo "Waiting for ${CONTROL_PLANE_MACHINE_COUNT} control plane machine(s), ${WORKER_MACHINE_COUNT} worker machine(s), and ${WINDOWS_WORKER_MACHINE_COUNT:-0} windows machine(s) to become Ready"
while ((WORKER_MACHINE_COUNT < TOTAL_WORKER_MACHINE_COUNT)); do
WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT + EXTRA_NODES_PER_SCALEOUT))
WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT > TOTAL_WORKER_MACHINE_COUNT ? TOTAL_WORKER_MACHINE_COUNT : WORKER_MACHINE_COUNT))

# Ensure that all nodes are registered with the API server before checking for readiness
local total_nodes="$((CONTROL_PLANE_MACHINE_COUNT + WORKER_MACHINE_COUNT + WINDOWS_WORKER_MACHINE_COUNT))"
while [[ $("${KUBECTL}" get nodes -ojson | jq '.items | length') -ne "${total_nodes}" ]]; do
sleep 10
done
"${KUBECTL}" --kubeconfig "${REPO_ROOT}/${KIND_CLUSTER_NAME}.kubeconfig" scale --namespace default machinedeployment/"${CLUSTER_NAME}"-md-0 --replicas="${WORKER_MACHINE_COUNT}"

until "${KUBECTL}" wait --for=condition=Ready node --all --timeout=15m; do
sleep 5
echo "Waiting for ${CONTROL_PLANE_MACHINE_COUNT} control plane machine(s), ${WORKER_MACHINE_COUNT} worker machine(s), and ${WINDOWS_WORKER_MACHINE_COUNT:-0} windows machine(s) to become Ready"

# Ensure that all nodes are registered with the API server before checking for readiness
local total_nodes="$((CONTROL_PLANE_MACHINE_COUNT + WORKER_MACHINE_COUNT + WINDOWS_WORKER_MACHINE_COUNT))"
while [[ $("${KUBECTL}" get nodes -ojson | jq '.items | length') -ne "${total_nodes}" ]]; do
sleep 10
done

until "${KUBECTL}" wait --for=condition=Ready node --all --timeout=15m > /dev/null; do
sleep 5
done
done
until "${KUBECTL}" get nodes -o wide; do
sleep 5
sleep 5
done
}

@@ -221,7 +232,7 @@ install_addons() {
# we need to wait a bit for nodes and pods to reach a terminal state,
# so we block successful return until the cluster is fully operational.
export -f wait_for_nodes
timeout --foreground 1800 bash -c wait_for_nodes
timeout --foreground "$((TOTAL_WORKER_MACHINE_COUNT > 100 ? 10800 : 1800))" bash -c wait_for_nodes
export -f wait_for_pods
timeout --foreground 1800 bash -c wait_for_pods
}
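
A minimal standalone sketch of the scale-out stepping and timeout selection used above; the worker totals are illustrative, and only the arithmetic mirrors the diff:

#!/usr/bin/env bash
# Sketch only: walks the same increment-and-clamp arithmetic as wait_for_nodes.
set -euo pipefail

TOTAL_WORKER_MACHINE_COUNT=250  # illustrative target, not a value from this PR
EXTRA_NODES_PER_SCALEOUT=100    # default step size from the diff
WORKER_MACHINE_COUNT=0

while ((WORKER_MACHINE_COUNT < TOTAL_WORKER_MACHINE_COUNT)); do
  WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT + EXTRA_NODES_PER_SCALEOUT))
  # Clamp the last step so the MachineDeployment never overshoots the target.
  WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT > TOTAL_WORKER_MACHINE_COUNT ? TOTAL_WORKER_MACHINE_COUNT : WORKER_MACHINE_COUNT))
  echo "scale md-0 to ${WORKER_MACHINE_COUNT} replicas, then wait for Ready nodes"
done
# With the values above, the loop targets 100, 200, and finally 250 replicas.

# The node-wait timeout grows for large clusters, matching the expression in the diff.
echo "wait_for_nodes timeout: $((TOTAL_WORKER_MACHINE_COUNT > 100 ? 10800 : 1800))s"

With these defaults a 250-worker request is applied in three scale operations instead of one, so node registration and readiness are checked in bounded batches.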
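For reference, a hypothetical invocation under the new variables (the 300-worker target below is illustrative, not taken from this PR): callers still set WORKER_MACHINE_COUNT as before, the script records it as TOTAL_WORKER_MACHINE_COUNT, and EXTRA_NODES_PER_SCALEOUT controls how many workers are added per step.

# Hypothetical example: build a CI cluster with 300 workers, scaled out 100 at a time.
WORKER_MACHINE_COUNT=300 EXTRA_NODES_PER_SCALEOUT=100 ./scripts/ci-entrypoint.sh
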
12 changes: 6 additions & 6 deletions test/e2e/azure_clusterproxy.go
@@ -82,8 +82,8 @@
}

func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) {
Logf("Dumping workload cluster %s/%s logs", namespace, name)
acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)
// Logf("Dumping workload cluster %s/%s logs", namespace, name)

Check failure on line 85 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): commentedOutCode: may want to remove commented-out code (gocritic)
// acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)

aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1)

@@ -92,10 +92,10 @@
acp.collectNodes(ctx, namespace, name, aboveMachinesPath)
Logf("Fetching nodes took %s", time.Since(start).String())

Logf("Dumping workload cluster %s/%s pod logs", namespace, name)
start = time.Now()
acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
Logf("Fetching pod logs took %s", time.Since(start).String())
// Logf("Dumping workload cluster %s/%s pod logs", namespace, name)

Check failure on line 95 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): commentedOutCode: may want to remove commented-out code (gocritic)
// start = time.Now()
// acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
// Logf("Fetching pod logs took %s", time.Since(start).String())

Logf("Dumping workload cluster %s/%s Azure activity log", namespace, name)
start = time.Now()
Expand All @@ -103,7 +103,7 @@
Logf("Fetching activity logs took %s", time.Since(start).String())
}

func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) {

Check failure on line 106 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): func (*AzureClusterProxy).collectPodLogs is unused (unused)
workload := acp.GetWorkloadCluster(ctx, namespace, name)
pods := &corev1.PodList{}

@@ -143,7 +143,7 @@
}
}

func collectContainerLogs(ctx context.Context, pod corev1.Pod, container corev1.Container, aboveMachinesPath string, workload framework.ClusterProxy) {

Check failure on line 146 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): func collectContainerLogs is unused (unused)
defer GinkgoRecover()

podNamespace := pod.GetNamespace()
Expand Down