
Commit 896ff1f

Author: sivakami (committed)
Add SwiftV2 long-running pipeline with scheduled tests
- Implemented scheduled pipeline running every 3 hours with persistent infrastructure
- Split test execution into 2 jobs: Create (with a 20-minute wait) and Delete
- Added 8 test scenarios across 2 AKS clusters and 4 VNets with different subnets
- Implemented a two-phase deletion strategy to prevent PNI ReservationInUse errors (see the sketch after this list)
- Added context timeouts on kubectl commands, with force-delete fallbacks
- Resource naming uses the RG name as BUILD_ID for uniqueness across parallel setups
- Added SkipAutoDeleteTill tags to prevent automatic resource cleanup
- Made setup stages conditional, controlled by the runSetupStages parameter
- Auto-generate the RG name from the location, or allow custom names for parallel setups
- Added a comprehensive README with setup instructions and troubleshooting
- Node selection by agentpool labels, with usage tracking to prevent conflicts
- Kubernetes naming compliance (RFC 1123) for all resources

Squashed fixup commits:
- Fix ginkgo flag. Add datapath tests. Delete old test file.
- Add test cases for private endpoint. Ginkgo runs specs only on specified files. Update pipeline params. Add ginkgo tags.
- Add datapath tests. Add ginkgo build tags. Remove wait time. Set namespace. Update pod image.
- Add more NSG rules to block subnets s1 and s2. Test change. Change delegated subnet address range.
- Use delegated interface for network connectivity tests. Datapath test between clusters.
- Test private endpoints; fix private endpoint tests. Set storage account names in output var. Set storage account name. Fix pn names.
- Update pe. Update pe test. Update SAS token generation.
- Add node labels for sw2 scenario; clean up pods on any test failure. Enable NSG tests. Update storage. Add rules to NSG.
- Disable private endpoint negative test. Disable public network access on storage account with private endpoint. Wait for default NSG to be created.
- Private endpoint depends on AKS cluster VNets; change pipeline job dependencies.
- Add node labels for each workload type and NIC capacity. Make SKU constant.
- Update README; set schedule for long-running cluster on test branch.
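As a concrete illustration of the deletion flow described above, here is a minimal sketch. The namespace, the podnetworkinstances resource kind, and the timeout values are assumptions for illustration, not code from this commit:

#!/usr/bin/env bash
# Hypothetical sketch of the two-phase deletion with kubectl timeouts and a
# force-delete fallback. Namespace, resource kind, and timeouts are assumed.
set -uo pipefail

NS="sv2-tests"                  # assumed test namespace
KCFG="/tmp/aks-1.kubeconfig"    # kubeconfig as written by create_aks.sh

# Phase 1: delete pods first, so the PNI reservations they hold are released.
# Bound the call with a timeout; fall back to a force delete if it hangs.
if ! timeout 120 kubectl --kubeconfig "$KCFG" -n "$NS" delete pods --all --wait=true; then
  echo "Graceful pod deletion timed out or failed; forcing."
  kubectl --kubeconfig "$KCFG" -n "$NS" delete pods --all --force --grace-period=0 || true
fi

# Phase 2: delete pod network resources only after the pods are gone.
# Removing them while pods still hold reservations is what triggers
# PNI ReservationInUse errors.
timeout 120 kubectl --kubeconfig "$KCFG" -n "$NS" delete podnetworkinstances --all --wait=true || true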
1 parent b9f406a commit 896ff1f

File tree

20 files changed: +3164 -209 lines


.pipelines/swiftv2-long-running/README.md

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 24 additions & 19 deletions
@@ -1,42 +1,47 @@
 trigger: none
+pr: none
+
+# Schedule: Run every 3 hours
+schedules:
+  - cron: "0 */3 * * *" # Every 3 hours at minute 0
+    displayName: "Run tests every 3 hours"
+    branches:
+      include:
+        - sv2-long-running-pipeline-stage2
+    always: true # Run even if there are no code changes
 
 parameters:
   - name: subscriptionId
     displayName: "Azure Subscription ID"
     type: string
     default: "37deca37-c375-4a14-b90a-043849bd2bf1"
 
+  - name: serviceConnection
+    displayName: "Azure Service Connection"
+    type: string
+    default: "Azure Container Networking - Standalone Test Service Connection"
+
   - name: location
     displayName: "Deployment Region"
     type: string
     default: "centraluseuap"
 
-  - name: resourceGroupName
-    displayName: "Resource Group Name"
-    type: string
-    default: "long-run-$(Build.BuildId)"
-
-  - name: vmSkuDefault
-    displayName: "VM SKU for Default Node Pool"
-    type: string
-    default: "Standard_D2s_v3"
-
-  - name: vmSkuHighNIC
-    displayName: "VM SKU for High NIC Node Pool"
-    type: string
-    default: "Standard_D16s_v3"
+  - name: runSetupStages
+    displayName: "Create New Infrastructure Setup"
+    type: boolean
+    default: false
 
-  - name: serviceConnection
-    displayName: "Azure Service Connection"
+  # Setup-only parameters (only used when runSetupStages=true)
+  - name: resourceGroupName
+    displayName: "Resource Group Name used when Create new Infrastructure Setup is selected"
     type: string
-    default: "Azure Container Networking - Standalone Test Service Connection"
+    default: "sv2-long-run-$(Build.BuildId)"
 
 extends:
   template: template/long-running-pipeline-template.yaml
   parameters:
     subscriptionId: ${{ parameters.subscriptionId }}
     location: ${{ parameters.location }}
     resourceGroupName: ${{ parameters.resourceGroupName }}
-    vmSkuDefault: ${{ parameters.vmSkuDefault }}
-    vmSkuHighNIC: ${{ parameters.vmSkuHighNIC }}
     serviceConnection: ${{ parameters.serviceConnection }}
+    runSetupStages: ${{ parameters.runSetupStages }}
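To make the parameter split concrete: a scheduled run leaves runSetupStages at its default of false and reuses the persistent clusters, while a fresh parallel setup overrides it together with a custom resource group name. A hypothetical way to queue both from the Azure DevOps CLI (the pipeline name "swiftv2-long-running" is an assumed placeholder; az pipelines run comes from the azure-devops extension):

# Assumed pipeline name; requires: az extension add --name azure-devops
az pipelines run --name "swiftv2-long-running" \
  --branch sv2-long-running-pipeline-stage2

# Fresh infrastructure in parallel with an existing setup: enable the setup
# stages and pick a unique RG name so resources cannot collide.
az pipelines run --name "swiftv2-long-running" \
  --branch sv2-long-running-pipeline-stage2 \
  --parameters runSetupStages=true resourceGroupName=sv2-long-run-custom-01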

.pipelines/swiftv2-long-running/scripts/create_aks.sh

Lines changed: 100 additions & 44 deletions
@@ -7,57 +7,113 @@ RG=$3
 VM_SKU_DEFAULT=$4
 VM_SKU_HIGHNIC=$5
 
-CLUSTER_COUNT=2
-CLUSTER_PREFIX="aks"
-DEFAULT_NODE_COUNT=1
-COMMON_TAGS="fastpathenabled=true RGOwner=LongRunningTestPipelines stampcreatorserviceinfo=true"
-
-wait_for_provisioning() { # Helper for safe retry/wait for provisioning states (basic)
-  local rg="$1" clusterName="$2"
-  echo "Waiting for AKS '$clusterName' in RG '$rg' to reach Succeeded/Failed (polling)..."
+CLUSTER_COUNT=2
+CLUSTER_PREFIX="aks"
+
+
+stamp_vnet() {
+  local vnet_id="$1"
+
+  responseFile="response.txt"
+  modified_vnet="${vnet_id//\//%2F}"
+  cmd_stamp_curl="'curl -v -X PUT http://localhost:8080/VirtualNetwork/$modified_vnet/stampcreatorservicename'"
+  cmd_containerapp_exec="az containerapp exec -n subnetdelegator-westus-u3h4j -g subnetdelegator-westus --subscription 9b8218f9-902a-4d20-a65c-e98acec5362f --command $cmd_stamp_curl"
+
+  max_retries=10
+  sleep_seconds=15
+  retry_count=0
+
+  while [[ $retry_count -lt $max_retries ]]; do
+    script --quiet -c "$cmd_containerapp_exec" "$responseFile"
+    if grep -qF "200 OK" "$responseFile"; then
+      echo "Subnet Delegator successfully stamped the vnet"
+      return 0
+    else
+      echo "Subnet Delegator failed to stamp the vnet, attempt $((retry_count+1))"
+      cat "$responseFile"
+      retry_count=$((retry_count+1))
+      sleep "$sleep_seconds"
+    fi
+  done
+
+  echo "Failed to stamp the vnet even after $max_retries attempts"
+  exit 1
+}
+
+wait_for_provisioning() {
+  local rg="$1" clusterName="$2"
+  echo "Waiting for AKS '$clusterName' in RG '$rg'..."
   while :; do
     state=$(az aks show --resource-group "$rg" --name "$clusterName" --query provisioningState -o tsv 2>/dev/null || true)
-    if [ -z "$state" ]; then
-      sleep 3
-      continue
+    if [[ "$state" =~ Succeeded ]]; then
+      echo "Provisioning state: $state"
+      break
     fi
-    case "$state" in
-      Succeeded|Succeeded*) echo "Provisioning state: $state"; break ;;
-      Failed|Canceled|Rejected) echo "Provisioning finished with state: $state"; break ;;
-      *) printf "."; sleep 6 ;;
-    esac
+    if [[ "$state" =~ Failed|Canceled ]]; then
+      echo "Provisioning finished with state: $state"
+      break
+    fi
+    sleep 6
   done
 }
 
 
+#########################################
+# Main script starts here
+#########################################
+
 for i in $(seq 1 "$CLUSTER_COUNT"); do
-  echo "=============================="
-  echo " Working on cluster set #$i"
-  echo "=============================="
-
-  CLUSTER_NAME="${CLUSTER_PREFIX}-${i}"
-  echo "Creating AKS cluster '$CLUSTER_NAME' in RG '$RG'"
-
-  make -C ./hack/aks azcfg AZCLI=az REGION=$LOCATION
-
-  make -C ./hack/aks swiftv2-podsubnet-cluster-up \
-    AZCLI=az REGION=$LOCATION \
-    SUB=$SUBSCRIPTION_ID \
-    GROUP=$RG \
-    CLUSTER=$CLUSTER_NAME \
-    NODE_COUNT=$DEFAULT_NODE_COUNT \
-    VM_SIZE=$VM_SKU_DEFAULT \
-
-  echo " - waiting for AKS provisioning state..."
-  wait_for_provisioning "$RG" "$CLUSTER_NAME"
-
-  echo "Adding multi-tenant nodepool ' to '$CLUSTER_NAME'"
-  make -C ./hack/aks linux-swiftv2-nodepool-up \
-    AZCLI=az REGION=$LOCATION \
-    GROUP=$RG \
-    VM_SIZE=$VM_SKU_HIGHNIC \
-    CLUSTER=$CLUSTER_NAME \
-    SUB=$SUBSCRIPTION_ID \
+  echo "Creating cluster #$i..."
 
+  CLUSTER_NAME="${CLUSTER_PREFIX}-${i}"
+
+  make -C ./hack/aks azcfg AZCLI=az REGION=$LOCATION
+
+  # Create cluster with SkipAutoDeleteTill tag for persistent infrastructure
+  make -C ./hack/aks swiftv2-podsubnet-cluster-up \
+    AZCLI=az REGION=$LOCATION \
+    SUB=$SUBSCRIPTION_ID \
+    GROUP=$RG \
+    CLUSTER=$CLUSTER_NAME \
+    VM_SIZE=$VM_SKU_DEFAULT
+
+  # Add SkipAutoDeleteTill tag to cluster (2032-12-31 for long-term persistence)
+  az aks update -g "$RG" -n "$CLUSTER_NAME" --tags SkipAutoDeleteTill=2032-12-31 || echo "Warning: Failed to add tag to cluster"
+
+  wait_for_provisioning "$RG" "$CLUSTER_NAME"
+
+  vnet_id=$(az network vnet show -g "$RG" --name "$CLUSTER_NAME" --query id -o tsv)
+  echo "Found VNET: $vnet_id"
+
+  # Add SkipAutoDeleteTill tag to AKS VNet
+  az network vnet update --ids "$vnet_id" --set tags.SkipAutoDeleteTill=2032-12-31 || echo "Warning: Failed to add tag to vnet"
+
+  stamp_vnet "$vnet_id"
+
+  make -C ./hack/aks linux-swiftv2-nodepool-up \
+    AZCLI=az REGION=$LOCATION \
+    GROUP=$RG \
+    VM_SIZE=$VM_SKU_HIGHNIC \
+    CLUSTER=$CLUSTER_NAME \
+    SUB=$SUBSCRIPTION_ID
+
+  az aks get-credentials -g "$RG" -n "$CLUSTER_NAME" --admin --overwrite-existing \
+    --file "/tmp/${CLUSTER_NAME}.kubeconfig"
+
+  # Label all nodes with workload-type and nic-capacity labels
+  echo "==> Labeling all nodes in $CLUSTER_NAME with workload-type=swiftv2-linux"
+  kubectl --kubeconfig "/tmp/${CLUSTER_NAME}.kubeconfig" label nodes --all workload-type=swiftv2-linux --overwrite
+  echo "[OK] All nodes labeled with workload-type=swiftv2-linux"
+
+  # Label default nodepool (nodepool1) with low-nic capacity
+  echo "==> Labeling default nodepool (nodepool1) nodes with nic-capacity=low-nic"
+  kubectl --kubeconfig "/tmp/${CLUSTER_NAME}.kubeconfig" label nodes -l agentpool=nodepool1 nic-capacity=low-nic --overwrite
+  echo "[OK] Default nodepool nodes labeled with nic-capacity=low-nic"
+
+  # Label nplinux nodepool with high-nic capacity
+  echo "==> Labeling nplinux nodepool nodes with nic-capacity=high-nic"
+  kubectl --kubeconfig "/tmp/${CLUSTER_NAME}.kubeconfig" label nodes -l agentpool=nplinux nic-capacity=high-nic --overwrite
+  echo "[OK] nplinux nodepool nodes labeled with nic-capacity=high-nic"
 done
-echo "All done. Created $CLUSTER_COUNT cluster set(s)."
+
+echo "All clusters complete."
