Skip to content

Commit bbfafa7

Browse files
committed
Another attempt at fixing the workflow
1 parent 704572b commit bbfafa7

File tree

2 files changed

+82
-17
lines changed

2 files changed

+82
-17
lines changed

.github/workflows/deploy-azure.yml

Lines changed: 72 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -101,46 +101,105 @@ jobs:
101101
fi
102102
103103
################################################################
# 2.5 · Check for ongoing operations and abort/wait
################################################################
- name: Check AKS operations and abort if needed
  run: |
    set -euo pipefail
    RG="rg-aks-memgraph-${{ env.ENVIRONMENT }}"
    AKS="aks-memgraph-${{ env.ENVIRONMENT }}"

    if az aks show --resource-group "$RG" --name "$AKS" &>/dev/null; then
      # Look for in-flight AKS operations that would make a concurrent
      # Terraform apply fail with an "operation in progress" conflict.
      # NOTE(review): guard the command substitution — under `set -e` an
      # unguarded failure here (e.g. `az aks operation list` missing on an
      # older Azure CLI, which only shipped `show`/`show-latest` at first)
      # would abort the whole step before any graceful handling runs.
      echo "🔍 Checking for ongoing AKS operations..."
      ONGOING_OPS=$(az aks operation list --resource-group "$RG" --name "$AKS" \
        --query "[?status=='Running' || status=='Pending']" -o json) || ONGOING_OPS='[]'

      if [[ $(echo "$ONGOING_OPS" | jq 'length') -gt 0 ]]; then
        echo "⚠️ Found ongoing operations. Attempting to abort them..."
        # Read IDs line-by-line instead of word-splitting an unquoted
        # variable; skip empty lines defensively.
        while IFS= read -r OP_ID; do
          [ -n "$OP_ID" ] || continue
          echo "🛑 Aborting operation $OP_ID..."
          az aks operation-abort --resource-group "$RG" --name "$AKS" \
            --operation-id "$OP_ID" \
            || echo "⚠️ Could not abort operation, may require manual intervention"
        done < <(echo "$ONGOING_OPS" | jq -r '.[].operationId')

        # Poll until every operation has completed or been aborted,
        # up to max_retries * 15s.
        echo "⏳ Waiting for operations to complete or be aborted..."
        max_retries=20
        retry_count=0
        while [ $retry_count -lt $max_retries ]; do
          # Same guard as above: treat a failed query as "none ongoing"
          # rather than killing the step.
          ONGOING_COUNT=$(az aks operation list --resource-group "$RG" --name "$AKS" \
            --query "length([?status=='Running' || status=='Pending'])" -o tsv) || ONGOING_COUNT=0
          if [ "$ONGOING_COUNT" -eq 0 ]; then
            echo "✅ No more ongoing operations"
            break
          fi
          echo "⏳ Still have $ONGOING_COUNT ongoing operations, waiting 15s... ($(($retry_count + 1))/$max_retries)"
          sleep 15
          retry_count=$((retry_count + 1))
        done

        if [ $retry_count -eq $max_retries ]; then
          echo "::warning::Timed out waiting for operations to complete. This might cause Terraform to fail."
        fi
      else
        echo "✅ No ongoing operations found"
      fi

      # Additionally wait for the cluster itself to settle into the
      # 'Succeeded' provisioning state (up to max_retries * 30s).
      echo "⏳ Ensuring cluster is in a Succeeded state..."
      max_retries=10
      retry_count=0
      while [ $retry_count -lt $max_retries ]; do
        PROVISION_STATE=$(az aks show --resource-group "$RG" --name "$AKS" --query "provisioningState" -o tsv)
        if [ "$PROVISION_STATE" = "Succeeded" ]; then
          echo "✅ AKS cluster is in Succeeded state"
          break
        fi
        echo "⏳ AKS state: $PROVISION_STATE, waiting 30s... ($(($retry_count + 1))/$max_retries)"
        sleep 30
        retry_count=$((retry_count + 1))
      done

      if [ $retry_count -eq $max_retries ]; then
        echo "::warning::Timed out waiting for AKS to reach Succeeded state. This might cause Terraform to fail."
      fi
    else
      echo "⏭️ No existing AKS cluster to check"
    fi
135169
136170
################################################################
# 3 · Terraform apply (with retries)
################################################################
- name: Terraform apply
  env: { TF_IN_AUTOMATION: true }
  run: |
    max_retries=5
    retry_count=0

    while [ $retry_count -lt $max_retries ]; do
      echo "🔄 Running Terraform apply (attempt $(($retry_count + 1))/$max_retries)..."

      # BUG FIX: capture the exit code with `|| rc=$?` instead of reading
      # `$?` after an `if cmd; then ...; fi` statement. When the condition
      # fails and no branch runs, the completed `if` compound command has
      # status 0 — so the original script's `exit $exit_code` on the final
      # attempt was `exit 0`, and the job succeeded even though terraform
      # had failed every attempt.
      rc=0
      terraform -chdir="$TF_DIR" apply -auto-approve -input=false -no-color \
        -var="node_vm_size=${{ env.NODE_VM_SIZE }}" || rc=$?

      if [ "$rc" -eq 0 ]; then
        echo "✅ Terraform apply succeeded!"
        exit 0
      fi

      echo "⚠️ Terraform apply failed with exit code $rc"

      # On the last attempt, propagate terraform's real exit code.
      if [ $(($retry_count + 1)) -eq $max_retries ]; then
        echo "❌ All retry attempts exhausted"
        exit "$rc"
      fi

      # Linear backoff: 30s, 60s, 90s, ...
      retry_count=$((retry_count + 1))
      wait_time=$((30 * retry_count))
      echo "⏳ Waiting ${wait_time}s before retry..."
      sleep "$wait_time"
    done
144203
145204
################################################################
146205
# 4 · kubeconfig

infra/azure/main.tf

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,17 @@ resource "azurerm_kubernetes_cluster" "this" {
5151
name = "default"
5252
node_count = 1
5353
vm_size = var.node_vm_size
54+
55+
# Use lifecycle to ignore changes to upgrade_settings to prevent conflicts
56+
# This lets Azure manage these settings instead of forcing our values
57+
}
5458

55-
# Explicitly define upgrade settings to match Azure defaults
56-
upgrade_settings {
57-
max_surge = "33%"
58-
}
59+
# Prevent Terraform from modifying certain attributes that might cause conflicts
60+
lifecycle {
61+
ignore_changes = [
62+
default_node_pool[0].upgrade_settings,
63+
tags
64+
]
5965
}
6066

6167
identity { type = "SystemAssigned" }

0 commit comments

Comments
 (0)