Skip to content

Commit 5d7feed

Browse files
authored
Add Retry Mechanism to E2E EC2 Terraform Deployment (#635)
* Add Retry Mechanism to E2E EC2 Terraform Deployment * Add Extra Comments * Refactor code
1 parent efb16c6 commit 5d7feed

File tree

2 files changed

+64
-26
lines changed

2 files changed

+64
-26
lines changed

.github/workflows/appsignals-e2e-ec2-test.yml

Lines changed: 63 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -56,18 +56,72 @@ jobs:
5656
with:
5757
terraform_wrapper: false
5858

59-
- name: Deploy sample app via terraform
59+
- name: Deploy sample app via terraform and wait for endpoint to come online
6060
working-directory: testing/terraform/ec2
6161
run: |
6262
terraform init
6363
terraform validate
64-
terraform apply -auto-approve \
65-
-var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
66-
-var="test_id=${{ env.TESTING_ID }}" \
67-
-var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" \
68-
-var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \
69-
-var="cw_agent_rpm=${{ env.APP_SIGNALS_CW_AGENT_RPM }}" \
70-
-var="adot_jar=${{ env.APP_SIGNALS_ADOT_JAR }}"
64+
65+
# Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online.
66+
# There may be occasional failures due to transient issues, so try up to 2 times.
67+
# deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates
68+
# that it failed at some point
69+
retry_counter=0
70+
max_retry=2
71+
while [ $retry_counter -lt $max_retry ]; do
72+
echo "Attempt $retry_counter"
73+
deployment_failed=0
74+
terraform apply -auto-approve \
75+
-var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
76+
-var="test_id=${{ env.TESTING_ID }}" \
77+
-var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" \
78+
-var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \
79+
-var="cw_agent_rpm=${{ env.APP_SIGNALS_CW_AGENT_RPM }}" \
80+
-var="adot_jar=${{ env.APP_SIGNALS_ADOT_JAR }}" \
81+
|| deployment_failed=1  # normalize ANY non-zero terraform exit status to 1; the checks below only test for 0 or 1
82+
83+
if [ $deployment_failed -eq 1 ]; then
84+
echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
85+
fi
86+
87+
# If deployment_failed is still 0, the terraform deployment succeeded, so now try to connect to the endpoint.
88+
# Attempts to connect will be made for up to 10 minutes
89+
if [ $deployment_failed -eq 0 ]; then
90+
echo "Attempting to connect to the endpoint"
91+
sample_app_endpoint=http://$(terraform output sample_app_main_service_public_dns):8080
92+
attempt_counter=0
93+
max_attempts=60
94+
until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
95+
if [ ${attempt_counter} -eq ${max_attempts} ];then
96+
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
97+
deployment_failed=1
98+
break
99+
fi
100+
101+
printf '.'
102+
attempt_counter=$(($attempt_counter+1))
103+
sleep 10
104+
done
105+
fi
106+
107+
# If deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the
108+
# resources created from terraform and try again.
109+
if [ $deployment_failed -eq 1 ]; then
110+
echo "Destroying terraform"
111+
terraform destroy -auto-approve \
112+
-var="test_id=${{ env.TESTING_ID }}"
113+
114+
retry_counter=$(($retry_counter+1))
115+
else
116+
# If deployment succeeded, then exit the loop
117+
break
118+
fi
119+
120+
if [ $retry_counter -eq $max_retry ]; then
121+
echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code"
122+
exit 1
123+
fi
124+
done
71125
72126
- name: Get the ec2 instance ami id
73127
run: |
@@ -80,22 +134,6 @@ jobs:
80134
echo "REMOTE_SERVICE_IP=$(terraform output sample_app_remote_service_public_ip)" >> $GITHUB_ENV
81135
working-directory: testing/terraform/ec2
82136

83-
- name: Wait for app endpoint to come online
84-
id: endpoint-check
85-
run: |
86-
attempt_counter=0
87-
max_attempts=30
88-
until $(curl --output /dev/null --silent --head --fail http://${{ env.MAIN_SERVICE_ENDPOINT }}); do
89-
if [ ${attempt_counter} -eq ${max_attempts} ];then
90-
echo "Max attempts reached"
91-
exit 1
92-
fi
93-
94-
printf '.'
95-
attempt_counter=$(($attempt_counter+1))
96-
sleep 10
97-
done
98-
99137
# This steps increases the speed of the validation by creating the telemetry data in advance
100138
- name: Call all test APIs
101139
continue-on-error: true
@@ -182,4 +220,4 @@ jobs:
182220
working-directory: testing/terraform/ec2
183221
run: |
184222
terraform destroy -auto-approve \
185-
-var="test_id=${{ env.TESTING_ID }}"
223+
-var="test_id=${{ env.TESTING_ID }}"

.github/workflows/appsignals-e2e-eks-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,4 +332,4 @@ jobs:
332332
--name service-account-${{ env.TESTING_ID }} \
333333
--namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
334334
--cluster ${{ inputs.test-cluster-name }} \
335-
--region ${{ env.AWS_DEFAULT_REGION }}
335+
--region ${{ env.AWS_DEFAULT_REGION }}

0 commit comments

Comments
 (0)