Skip to content

Commit 82fada0

Browse files
committed
...
1 parent bbfafa7 commit 82fada0

File tree

2 files changed

+62
-40
lines changed

2 files changed

+62
-40
lines changed

.github/workflows/deploy-azure.yml

Lines changed: 52 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -101,75 +101,88 @@ jobs:
101101
fi
102102
103103
################################################################
104-
# 2.5 · Check for ongoing operations and abort/wait
104+
# 2.5 · Wait for AKS to be in a stable state
105105
################################################################
106-
- name: Check AKS operations and abort if needed
106+
- name: Wait for AKS stability
107107
run: |
108108
set -euo pipefail
109109
RG="rg-aks-memgraph-${{ env.ENVIRONMENT }}"
110110
AKS="aks-memgraph-${{ env.ENVIRONMENT }}"
111111
112112
if az aks show --resource-group "$RG" --name "$AKS" &>/dev/null; then
113-
# Check for ongoing operations
114-
echo "🔍 Checking for ongoing AKS operations..."
115-
ONGOING_OPS=$(az aks operation list --resource-group "$RG" --name "$AKS" --query "[?status=='Running' || status=='Pending']" -o json)
116-
117-
if [[ $(echo "$ONGOING_OPS" | jq 'length') -gt 0 ]]; then
118-
echo "⚠️ Found ongoing operations. Attempting to abort them..."
119-
OPERATION_IDS=$(echo "$ONGOING_OPS" | jq -r '.[].operationId')
120-
121-
for OP_ID in $OPERATION_IDS; do
122-
echo "🛑 Aborting operation $OP_ID..."
123-
az aks operation-abort --resource-group "$RG" --name "$AKS" --operation-id "$OP_ID" || echo "⚠️ Could not abort operation, may require manual intervention"
124-
done
125-
126-
# Wait for all operations to complete or be aborted
127-
echo "⏳ Waiting for operations to complete or be aborted..."
128-
max_retries=20
129-
retry_count=0
130-
while [ $retry_count -lt $max_retries ]; do
131-
ONGOING_COUNT=$(az aks operation list --resource-group "$RG" --name "$AKS" --query "length([?status=='Running' || status=='Pending'])" -o tsv)
132-
if [ "$ONGOING_COUNT" -eq 0 ]; then
133-
echo "✅ No more ongoing operations"
134-
break
135-
fi
136-
echo "⏳ Still have $ONGOING_COUNT ongoing operations, waiting 15s... ($(($retry_count + 1))/$max_retries)"
137-
sleep 15
138-
retry_count=$((retry_count + 1))
139-
done
140-
141-
if [ $retry_count -eq $max_retries ]; then
142-
echo "::warning::Timed out waiting for operations to complete. This might cause Terraform to fail."
143-
fi
144-
else
145-
echo "✅ No ongoing operations found"
146-
fi
113+
echo "🔍 Checking AKS cluster state..."
147114
148115
# Wait for the cluster to be in a Succeeded provisioning state
149-
echo "⏳ Ensuring cluster is in a Succeeded state..."
150-
max_retries=10
116+
echo "⏳ Waiting for cluster to be in a stable state..."
117+
max_retries=15
151118
retry_count=0
119+
152120
while [ $retry_count -lt $max_retries ]; do
153121
PROVISION_STATE=$(az aks show --resource-group "$RG" --name "$AKS" --query "provisioningState" -o tsv)
122+
154123
if [ "$PROVISION_STATE" = "Succeeded" ]; then
155124
echo "✅ AKS cluster is in Succeeded state"
125+
# Force a small delay to ensure all backend operations are truly complete
126+
sleep 30
156127
break
157128
fi
129+
158130
echo "⏳ AKS state: $PROVISION_STATE, waiting 30s... ($(($retry_count + 1))/$max_retries)"
159131
sleep 30
160132
retry_count=$((retry_count + 1))
161133
done
162134
163135
if [ $retry_count -eq $max_retries ]; then
164-
echo "::warning::Timed out waiting for AKS to reach Succeeded state. This might cause Terraform to fail."
136+
echo "::warning::Timed out waiting for AKS to reach Succeeded state."
137+
echo "⚠️ Attempting to proceed anyway, but Terraform might fail."
165138
fi
139+
140+
# Display detailed cluster information for debugging
141+
echo "📊 Current AKS cluster status:"
142+
az aks show --resource-group "$RG" --name "$AKS" --query "{provisioningState:provisioningState,powerState:powerState,kubernetesVersion:kubernetesVersion,nodeResourceGroup:nodeResourceGroup}" -o table || true
143+
144+
# This pause gives the system time to finalize any background operations
145+
echo "⏱️ Allowing 60 seconds of stabilization time before proceeding..."
146+
sleep 60
166147
else
167148
echo "⏭️ No existing AKS cluster to check"
168149
fi
169150
170151
################################################################
# 3 · Terraform apply (with retries)
################################################################
- name: Reconcile node resource group if needed
  # If the AKS-managed node resource group was deleted out-of-band,
  # trigger a no-op 'az aks update' so Azure recreates/reconciles it
  # before Terraform runs.
  run: |
    set -euo pipefail
    RG="rg-aks-memgraph-${{ env.ENVIRONMENT }}"
    AKS="aks-memgraph-${{ env.ENVIRONMENT }}"

    if az aks show --resource-group "$RG" --name "$AKS" &>/dev/null; then
      # Check if the node resource group exists
      NODE_RG=$(az aks show --resource-group "$RG" --name "$AKS" --query "nodeResourceGroup" -o tsv)
      echo "🔍 Node resource group is: $NODE_RG"

      if [ -z "$NODE_RG" ]; then
        # Guard: an empty name would make 'az group exists' error out.
        echo "::warning::Could not determine node resource group name; skipping reconciliation"
      elif [ "$(az group exists --name "$NODE_RG" || echo false)" != "true" ]; then
        # Capture into a variable instead of piping to grep: under 'pipefail'
        # a transient az failure in the pipeline would kill the whole step.
        echo "⚠️ Node resource group $NODE_RG doesn't exist, attempting to reconcile..."
        az aks update --resource-group "$RG" --name "$AKS" --yes || echo "⚠️ Reconciliation failed, proceeding anyway"

        # Wait for any update operation to complete
        echo "⏱️ Waiting 2 minutes for update operation to complete..."
        sleep 120

        # Check again
        if [ "$(az group exists --name "$NODE_RG" || echo false)" != "true" ]; then
          echo "::warning::Node resource group still doesn't exist after reconciliation attempt"
        else
          echo "✅ Node resource group now exists"
        fi
      else
        echo "✅ Node resource group exists"
      fi
    else
      echo "⏭️ No existing AKS cluster to reconcile"
    fi
185+
173186
- name: Terraform apply
174187
env: { TF_IN_AUTOMATION: true }
175188
run: |

infra/azure/main.tf

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,17 @@ resource "azurerm_kubernetes_cluster" "this" {
5959
  # Prevent Terraform from modifying certain attributes that might cause conflicts
  lifecycle {
    ignore_changes = [
      # Ignore all node pool settings that Azure might modify internally
      default_node_pool[0].upgrade_settings,
      default_node_pool[0].orchestrator_version,
      default_node_pool[0].node_labels,
      default_node_pool[0].node_taints,
      # Ignore other common attributes that cause drift
      kubernetes_version,
      sku_tier,
      tags,
      # NOTE(review): ignoring kubernetes_version means cluster upgrades can no
      # longer be driven from Terraform config — confirm this is intended.
      # Azure sometimes modifies these internal networking settings
      # NOTE(review): ignoring the whole network_profile block suppresses ALL
      # networking drift, not just Azure-internal tweaks — verify scope.
      network_profile
    ]
  }
6675

0 commit comments

Comments
 (0)