diff --git a/azure/services/virtualmachines/virtualmachines.go b/azure/services/virtualmachines/virtualmachines.go
index dcf2633f40d..b64cc8ffa8b 100644
--- a/azure/services/virtualmachines/virtualmachines.go
+++ b/azure/services/virtualmachines/virtualmachines.go
@@ -39,7 +39,14 @@ import (
 )
 
 const serviceName = "virtualmachine"
-const vmMissingUAI = "VM is missing expected user assigned identity with client ID: "
+
+func vmMissingUAI(expectedKey string, actualIdentities []infrav1.UserAssignedIdentity) string {
+	var actual []string
+	for _, a := range actualIdentities {
+		actual = append(actual, a.ProviderID)
+	}
+	return "VM is missing expected user assigned identity with ID " + expectedKey + ", VM has identities " + strings.Join(actual, ", ")
+}
 
 // VMScope defines the scope interface for a virtual machines service.
 type VMScope interface {
@@ -173,7 +180,7 @@ func (s *Service) checkUserAssignedIdentities(specIdentities []infrav1.UserAssig
 	for _, expectedIdentity := range specIdentities {
 		_, exists := actualMap[expectedIdentity.ProviderID]
 		if !exists {
-			s.Scope.SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI+expectedIdentity.ProviderID)
+			s.Scope.SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI(expectedIdentity.ProviderID, vmIdentities))
 			return
 		}
 	}
diff --git a/azure/services/virtualmachines/virtualmachines_test.go b/azure/services/virtualmachines/virtualmachines_test.go
index a3981683afe..0a42d9db3b6 100644
--- a/azure/services/virtualmachines/virtualmachines_test.go
+++ b/azure/services/virtualmachines/virtualmachines_test.go
@@ -358,7 +358,7 @@ func TestCheckUserAssignedIdentities(t *testing.T) {
 			scopeMock := mock_virtualmachines.NewMockVMScope(mockCtrl)
 
 			if tc.expectedKey != "" {
-				scopeMock.EXPECT().SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI+tc.expectedKey).Times(1)
+				scopeMock.EXPECT().SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI(tc.expectedKey, tc.actualIdentities)).Times(1)
 			}
 			s := &Service{
 				Scope: scopeMock,
diff --git a/controllers/azuremachine_controller.go b/controllers/azuremachine_controller.go
index 6466fabf3b2..b04c1c977f9 100644
--- a/controllers/azuremachine_controller.go
+++ b/controllers/azuremachine_controller.go
@@ -289,8 +289,8 @@ func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineS
 	if cond != nil && cond.Status == corev1.ConditionFalse && cond.Reason == infrav1.UserAssignedIdentityMissingReason {
 		amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, infrav1.UserAssignedIdentityMissingReason, "VM is unhealthy")
 		machineScope.SetFailureReason(azure.UnsupportedChange)
-		machineScope.SetFailureMessage(errors.New("VM identities are not ready"))
-		return reconcile.Result{}, errors.New("VM identities are not ready")
+		machineScope.SetFailureMessage(errors.New(cond.Message))
+		return reconcile.Result{}, errors.New(cond.Message)
 	}
 
 	ams, err := amr.createAzureMachineService(machineScope)
diff --git a/scripts/ci-entrypoint.sh b/scripts/ci-entrypoint.sh
index 89253f917c9..0156d6b8e37 100755
--- a/scripts/ci-entrypoint.sh
+++ b/scripts/ci-entrypoint.sh
@@ -30,11 +30,15 @@ KIND="${REPO_ROOT}/hack/tools/bin/kind"
KUSTOMIZE="${REPO_ROOT}/hack/tools/bin/kustomize" make --directory="${REPO_ROOT}" "${KUBECTL##*/}" "${HELM##*/}" "${KIND##*/}" "${KUSTOMIZE##*/}" KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:-capz}" -WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}" +EXTRA_NODES_PER_SCALEOUT="${EXTRA_NODES_PER_SCALEOUT:-100}" +export TOTAL_WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}" +WORKER_MACHINE_COUNT=0 export KIND_CLUSTER_NAME # export the variables so they are available in bash -c wait_for_nodes below export KUBECTL export HELM +export REPO_ROOT +export EXTRA_NODES_PER_SCALEOUT # shellcheck source=hack/ensure-go.sh source "${REPO_ROOT}/hack/ensure-go.sh" @@ -95,7 +99,7 @@ setup() { echo '' )}" export AZURE_RESOURCE_GROUP="${CLUSTER_NAME}" - if [ "${WORKER_MACHINE_COUNT}" -gt "10" ]; then + if [ "${TOTAL_WORKER_MACHINE_COUNT}" -gt "10" ]; then export AZURE_LOCATION="${AZURE_LOCATION:-$(capz::util::get_random_region_load)}" echo "Using AZURE_LOCATION: ${AZURE_LOCATION}" else @@ -112,7 +116,7 @@ setup() { # Need a cluster with at least 2 nodes export CONTROL_PLANE_MACHINE_COUNT="${CONTROL_PLANE_MACHINE_COUNT:-1}" export CCM_COUNT="${CCM_COUNT:-1}" - export WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}" + export WORKER_MACHINE_COUNT export EXP_CLUSTER_RESOURCE_SET="true" # TODO figure out a better way to account for expected Windows node count @@ -180,19 +184,26 @@ wait_for_copy_kubeadm_config_map() { # wait_for_nodes returns when all nodes in the workload cluster are Ready. wait_for_nodes() { - echo "Waiting for ${CONTROL_PLANE_MACHINE_COUNT} control plane machine(s), ${WORKER_MACHINE_COUNT} worker machine(s), and ${WINDOWS_WORKER_MACHINE_COUNT:-0} windows machine(s) to become Ready" + while ((WORKER_MACHINE_COUNT < TOTAL_WORKER_MACHINE_COUNT)); do + WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT + EXTRA_NODES_PER_SCALEOUT)) + WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT > TOTAL_WORKER_MACHINE_COUNT ? TOTAL_WORKER_MACHINE_COUNT : WORKER_MACHINE_COUNT)) - # Ensure that all nodes are registered with the API server before checking for readiness - local total_nodes="$((CONTROL_PLANE_MACHINE_COUNT + WORKER_MACHINE_COUNT + WINDOWS_WORKER_MACHINE_COUNT))" - while [[ $("${KUBECTL}" get nodes -ojson | jq '.items | length') -ne "${total_nodes}" ]]; do - sleep 10 - done + "${KUBECTL}" --kubeconfig "${REPO_ROOT}/${KIND_CLUSTER_NAME}.kubeconfig" scale --namespace default machinedeployment/"${CLUSTER_NAME}"-md-0 --replicas="${WORKER_MACHINE_COUNT}" - until "${KUBECTL}" wait --for=condition=Ready node --all --timeout=15m; do - sleep 5 + echo "Waiting for ${CONTROL_PLANE_MACHINE_COUNT} control plane machine(s), ${WORKER_MACHINE_COUNT} worker machine(s), and ${WINDOWS_WORKER_MACHINE_COUNT:-0} windows machine(s) to become Ready" + + # Ensure that all nodes are registered with the API server before checking for readiness + local total_nodes="$((CONTROL_PLANE_MACHINE_COUNT + WORKER_MACHINE_COUNT + WINDOWS_WORKER_MACHINE_COUNT))" + while [[ $("${KUBECTL}" get nodes -ojson | jq '.items | length') -ne "${total_nodes}" ]]; do + sleep 10 + done + + until "${KUBECTL}" wait --for=condition=Ready node --all --timeout=15m > /dev/null; do + sleep 5 + done done until "${KUBECTL}" get nodes -o wide; do - sleep 5 + sleep 5 done } @@ -221,7 +232,7 @@ install_addons() { # we need to wait a little bit for nodes and pods terminal state, # so we block successful return upon the cluster being fully operational. 
   export -f wait_for_nodes
-  timeout --foreground 1800 bash -c wait_for_nodes
+  timeout --foreground "$((TOTAL_WORKER_MACHINE_COUNT > 100 ? 10800 : 1800))" bash -c wait_for_nodes
   export -f wait_for_pods
   timeout --foreground 1800 bash -c wait_for_pods
 }
diff --git a/test/e2e/azure_clusterproxy.go b/test/e2e/azure_clusterproxy.go
index 26dfba9f6f3..11c1445de72 100644
--- a/test/e2e/azure_clusterproxy.go
+++ b/test/e2e/azure_clusterproxy.go
@@ -82,8 +82,8 @@ func initScheme() *runtime.Scheme {
 }
 
 func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) {
-	Logf("Dumping workload cluster %s/%s logs", namespace, name)
-	acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)
+	// Logf("Dumping workload cluster %s/%s logs", namespace, name)
+	// acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)
 
 	aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1)
 
@@ -92,10 +92,10 @@ func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, na
 	acp.collectNodes(ctx, namespace, name, aboveMachinesPath)
 	Logf("Fetching nodes took %s", time.Since(start).String())
 
-	Logf("Dumping workload cluster %s/%s pod logs", namespace, name)
-	start = time.Now()
-	acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
-	Logf("Fetching pod logs took %s", time.Since(start).String())
+	// Logf("Dumping workload cluster %s/%s pod logs", namespace, name)
+	// start = time.Now()
+	// acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
+	// Logf("Fetching pod logs took %s", time.Since(start).String())
 
 	Logf("Dumping workload cluster %s/%s Azure activity log", namespace, name)
 	start = time.Now()
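
Reviewer note: for anyone skimming the first hunk, here is a minimal, self-contained sketch of what the new condition-message helper produces. `UserAssignedIdentity` below is a stand-in for `infrav1.UserAssignedIdentity` (only `ProviderID` matters here) and the provider IDs are hypothetical. Note also that the helper relies on `strings.Join`, so the hunk assumes the `strings` package is already imported in virtualmachines.go, since it does not add the import.

```go
package main

import (
	"fmt"
	"strings"
)

// UserAssignedIdentity stands in for infrav1.UserAssignedIdentity;
// only the ProviderID field is relevant to the condition message.
type UserAssignedIdentity struct {
	ProviderID string
}

// vmMissingUAI mirrors the helper added in the diff: it names the missing
// identity and lists every identity the VM actually carries, so the
// condition message is actionable instead of a bare prefix plus one ID.
func vmMissingUAI(expectedKey string, actualIdentities []UserAssignedIdentity) string {
	var actual []string
	for _, a := range actualIdentities {
		actual = append(actual, a.ProviderID)
	}
	return "VM is missing expected user assigned identity with ID " + expectedKey +
		", VM has identities " + strings.Join(actual, ", ")
}

func main() {
	// Hypothetical identities, for illustration only.
	have := []UserAssignedIdentity{{ProviderID: "id-a"}, {ProviderID: "id-b"}}
	fmt.Println(vmMissingUAI("id-c", have))
	// Prints:
	// VM is missing expected user assigned identity with ID id-c, VM has identities id-a, id-b
}
```

Because azuremachine_controller.go now propagates `cond.Message` into both the failure message and the returned error, this richer text is what surfaces on the AzureMachine in place of the generic "VM identities are not ready".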
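Reviewer note: the reworked `wait_for_nodes` scales the MachineDeployment out in increments rather than requesting every worker at once. A sketch of the stepping arithmetic in Go, assuming the script default of `EXTRA_NODES_PER_SCALEOUT=100` and a hypothetical total of 250 workers:

```go
package main

import "fmt"

// scaleSteps reproduces the loop arithmetic from wait_for_nodes: grow the
// replica count by `extra` each iteration and clamp it at `total`, which is
// what the bash ternary $((count > total ? total : count)) does.
func scaleSteps(total, extra int) []int {
	var steps []int
	for count := 0; count < total; {
		count += extra
		if count > total {
			count = total
		}
		steps = append(steps, count)
	}
	return steps
}

func main() {
	fmt.Println(scaleSteps(250, 100)) // [100 200 250]
	fmt.Println(scaleSteps(2, 100))   // [2] — small clusters scale out in a single step
}
```

Each step issues a `kubectl scale` against the management cluster and then waits for the new nodes to register and become Ready, which is why the surrounding `timeout` grows from 1800 to 10800 seconds when more than 100 workers are requested.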