ci-entrypoint.sh: Incrementally scale large clusters #5779

Closed: wants to merge 3 commits
Changes from all commits
11 changes: 9 additions & 2 deletions azure/services/virtualmachines/virtualmachines.go
@@ -39,7 +39,14 @@ import (
)

const serviceName = "virtualmachine"
const vmMissingUAI = "VM is missing expected user assigned identity with client ID: "

func vmMissingUAI(expectedKey string, actualIdentities []infrav1.UserAssignedIdentity) string {
var actual []string
for _, a := range actualIdentities {
actual = append(actual, a.ProviderID)
}
return "VM is missing expected user assigned identity with ID " + expectedKey + ", VM has identities " + strings.Join(actual, ", ")
}

// VMScope defines the scope interface for a virtual machines service.
type VMScope interface {
@@ -173,7 +180,7 @@ func (s *Service) checkUserAssignedIdentities(specIdentities []infrav1.UserAssig
for _, expectedIdentity := range specIdentities {
_, exists := actualMap[expectedIdentity.ProviderID]
if !exists {
s.Scope.SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI+expectedIdentity.ProviderID)
s.Scope.SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI(expectedIdentity.ProviderID, vmIdentities))
return
}
}
2 changes: 1 addition & 1 deletion azure/services/virtualmachines/virtualmachines_test.go
@@ -358,7 +358,7 @@ func TestCheckUserAssignedIdentities(t *testing.T) {
scopeMock := mock_virtualmachines.NewMockVMScope(mockCtrl)

if tc.expectedKey != "" {
scopeMock.EXPECT().SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI+tc.expectedKey).Times(1)
scopeMock.EXPECT().SetConditionFalse(infrav1.VMIdentitiesReadyCondition, infrav1.UserAssignedIdentityMissingReason, clusterv1.ConditionSeverityWarning, vmMissingUAI(tc.expectedKey, tc.actualIdentities)).Times(1)
}
s := &Service{
Scope: scopeMock,
4 changes: 2 additions & 2 deletions controllers/azuremachine_controller.go
@@ -289,8 +289,8 @@ func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineS
if cond != nil && cond.Status == corev1.ConditionFalse && cond.Reason == infrav1.UserAssignedIdentityMissingReason {
amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, infrav1.UserAssignedIdentityMissingReason, "VM is unhealthy")
machineScope.SetFailureReason(azure.UnsupportedChange)
machineScope.SetFailureMessage(errors.New("VM identities are not ready"))
return reconcile.Result{}, errors.New("VM identities are not ready")
machineScope.SetFailureMessage(errors.New(cond.Message))
return reconcile.Result{}, errors.New(cond.Message)
}

ams, err := amr.createAzureMachineService(machineScope)
37 changes: 24 additions & 13 deletions scripts/ci-entrypoint.sh
@@ -30,11 +30,15 @@ KIND="${REPO_ROOT}/hack/tools/bin/kind"
KUSTOMIZE="${REPO_ROOT}/hack/tools/bin/kustomize"
make --directory="${REPO_ROOT}" "${KUBECTL##*/}" "${HELM##*/}" "${KIND##*/}" "${KUSTOMIZE##*/}"
KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:-capz}"
WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}"
EXTRA_NODES_PER_SCALEOUT="${EXTRA_NODES_PER_SCALEOUT:-100}"
export TOTAL_WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}"
WORKER_MACHINE_COUNT=0
export KIND_CLUSTER_NAME
# export the variables so they are available in bash -c wait_for_nodes below
export KUBECTL
export HELM
export REPO_ROOT
export EXTRA_NODES_PER_SCALEOUT

# shellcheck source=hack/ensure-go.sh
source "${REPO_ROOT}/hack/ensure-go.sh"
@@ -95,7 +99,7 @@ setup() {
echo ''
)}"
export AZURE_RESOURCE_GROUP="${CLUSTER_NAME}"
if [ "${WORKER_MACHINE_COUNT}" -gt "10" ]; then
if [ "${TOTAL_WORKER_MACHINE_COUNT}" -gt "10" ]; then
export AZURE_LOCATION="${AZURE_LOCATION:-$(capz::util::get_random_region_load)}"
echo "Using AZURE_LOCATION: ${AZURE_LOCATION}"
else
@@ -112,7 +116,7 @@ setup() {
# Need a cluster with at least 2 nodes
export CONTROL_PLANE_MACHINE_COUNT="${CONTROL_PLANE_MACHINE_COUNT:-1}"
export CCM_COUNT="${CCM_COUNT:-1}"
export WORKER_MACHINE_COUNT="${WORKER_MACHINE_COUNT:-2}"
export WORKER_MACHINE_COUNT
export EXP_CLUSTER_RESOURCE_SET="true"

# TODO figure out a better way to account for expected Windows node count
@@ -180,19 +184,26 @@ wait_for_copy_kubeadm_config_map() {

# wait_for_nodes returns when all nodes in the workload cluster are Ready.
wait_for_nodes() {
echo "Waiting for ${CONTROL_PLANE_MACHINE_COUNT} control plane machine(s), ${WORKER_MACHINE_COUNT} worker machine(s), and ${WINDOWS_WORKER_MACHINE_COUNT:-0} windows machine(s) to become Ready"
while ((WORKER_MACHINE_COUNT < TOTAL_WORKER_MACHINE_COUNT)); do
WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT + EXTRA_NODES_PER_SCALEOUT))
WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT > TOTAL_WORKER_MACHINE_COUNT ? TOTAL_WORKER_MACHINE_COUNT : WORKER_MACHINE_COUNT))

# Ensure that all nodes are registered with the API server before checking for readiness
local total_nodes="$((CONTROL_PLANE_MACHINE_COUNT + WORKER_MACHINE_COUNT + WINDOWS_WORKER_MACHINE_COUNT))"
while [[ $("${KUBECTL}" get nodes -ojson | jq '.items | length') -ne "${total_nodes}" ]]; do
sleep 10
done
"${KUBECTL}" --kubeconfig "${REPO_ROOT}/${KIND_CLUSTER_NAME}.kubeconfig" scale --namespace default machinedeployment/"${CLUSTER_NAME}"-md-0 --replicas="${WORKER_MACHINE_COUNT}"

until "${KUBECTL}" wait --for=condition=Ready node --all --timeout=15m; do
sleep 5
echo "Waiting for ${CONTROL_PLANE_MACHINE_COUNT} control plane machine(s), ${WORKER_MACHINE_COUNT} worker machine(s), and ${WINDOWS_WORKER_MACHINE_COUNT:-0} windows machine(s) to become Ready"

# Ensure that all nodes are registered with the API server before checking for readiness
local total_nodes="$((CONTROL_PLANE_MACHINE_COUNT + WORKER_MACHINE_COUNT + WINDOWS_WORKER_MACHINE_COUNT))"
while [[ $("${KUBECTL}" get nodes -ojson | jq '.items | length') -ne "${total_nodes}" ]]; do
sleep 10
done

until "${KUBECTL}" wait --for=condition=Ready node --all --timeout=15m > /dev/null; do
sleep 5
done
done
until "${KUBECTL}" get nodes -o wide; do
sleep 5
sleep 5
done
}

@@ -221,7 +232,7 @@ install_addons() {
# we need to wait a bit for nodes and pods to reach a terminal state,
# so we block successful return until the cluster is fully operational.
export -f wait_for_nodes
timeout --foreground 1800 bash -c wait_for_nodes
timeout --foreground "$((TOTAL_WORKER_MACHINE_COUNT > 100 ? 10800 : 1800))" bash -c wait_for_nodes
export -f wait_for_pods
timeout --foreground 1800 bash -c wait_for_pods
}
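
A minimal standalone sketch of the scale-out stepping and timeout selection used above; the worker totals are illustrative, and only the arithmetic mirrors the diff:

#!/usr/bin/env bash
# Sketch only: walks the same increment-and-clamp arithmetic as wait_for_nodes.
set -euo pipefail

TOTAL_WORKER_MACHINE_COUNT=250  # illustrative target, not a value from this PR
EXTRA_NODES_PER_SCALEOUT=100    # default step size from the diff
WORKER_MACHINE_COUNT=0

while ((WORKER_MACHINE_COUNT < TOTAL_WORKER_MACHINE_COUNT)); do
  WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT + EXTRA_NODES_PER_SCALEOUT))
  # Clamp the last step so the MachineDeployment never overshoots the target.
  WORKER_MACHINE_COUNT=$((WORKER_MACHINE_COUNT > TOTAL_WORKER_MACHINE_COUNT ? TOTAL_WORKER_MACHINE_COUNT : WORKER_MACHINE_COUNT))
  echo "scale md-0 to ${WORKER_MACHINE_COUNT} replicas, then wait for Ready nodes"
done
# With the values above, the loop targets 100, 200, and finally 250 replicas.

# The node-wait timeout grows for large clusters, matching the expression in the diff.
echo "wait_for_nodes timeout: $((TOTAL_WORKER_MACHINE_COUNT > 100 ? 10800 : 1800))s"

With these defaults a 250-worker request is applied in three scale operations instead of one, so node registration and readiness are checked in bounded batches.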
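For reference, a hypothetical invocation under the new variables (the 300-worker target below is illustrative, not taken from this PR): callers still set WORKER_MACHINE_COUNT as before, the script records it as TOTAL_WORKER_MACHINE_COUNT, and EXTRA_NODES_PER_SCALEOUT controls how many workers are added per step.

# Hypothetical example: build a CI cluster with 300 workers, scaled out 100 at a time.
WORKER_MACHINE_COUNT=300 EXTRA_NODES_PER_SCALEOUT=100 ./scripts/ci-entrypoint.sh
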
12 changes: 6 additions & 6 deletions test/e2e/azure_clusterproxy.go
@@ -82,8 +82,8 @@
}

func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) {
Logf("Dumping workload cluster %s/%s logs", namespace, name)
acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)
// Logf("Dumping workload cluster %s/%s logs", namespace, name)

Check failure on line 85 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): commentedOutCode: may want to remove commented-out code (gocritic)
// acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)

aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1)

@@ -92,10 +92,10 @@
acp.collectNodes(ctx, namespace, name, aboveMachinesPath)
Logf("Fetching nodes took %s", time.Since(start).String())

Logf("Dumping workload cluster %s/%s pod logs", namespace, name)
start = time.Now()
acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
Logf("Fetching pod logs took %s", time.Since(start).String())
// Logf("Dumping workload cluster %s/%s pod logs", namespace, name)

Check failure on line 95 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): commentedOutCode: may want to remove commented-out code (gocritic)
// start = time.Now()
// acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
// Logf("Fetching pod logs took %s", time.Since(start).String())

Logf("Dumping workload cluster %s/%s Azure activity log", namespace, name)
start = time.Now()
Expand All @@ -103,7 +103,7 @@
Logf("Fetching activity logs took %s", time.Since(start).String())
}

func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) {

Check failure on line 106 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): func (*AzureClusterProxy).collectPodLogs is unused (unused)
workload := acp.GetWorkloadCluster(ctx, namespace, name)
pods := &corev1.PodList{}

@@ -143,7 +143,7 @@
}
}

func collectContainerLogs(ctx context.Context, pod corev1.Pod, container corev1.Container, aboveMachinesPath string, workload framework.ClusterProxy) {

Check failure on line 146 in test/e2e/azure_clusterproxy.go (GitHub Actions / lint, lint (test)): func collectContainerLogs is unused (unused)
defer GinkgoRecover()

podNamespace := pod.GetNamespace()
Expand Down