Skip to content

Commit d80bf0d

Browse files
authored
Merge pull request #1245 from buildkite/pdp-1828-take-over-terminate-instance-pr
2 parents 35e31db + 198db6e commit d80bf0d

File tree

4 files changed

+94
-14
lines changed

4 files changed

+94
-14
lines changed

packer/linux/conf/bin/bk-install-elastic-stack.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,10 @@ echo Writing Phase 2/2 for /var/lib/buildkite-agent/cfn-env helper function...
131131
cat <<EOF >>/var/lib/buildkite-agent/cfn-env
132132
133133
set_always "BUILDKITE_AGENTS_PER_INSTANCE" "$BUILDKITE_AGENTS_PER_INSTANCE"
134+
135+
# also set via /etc/systemd/system/buildkite-agent.service.d/environment.conf
136+
set_always "BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB" "$BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB"
137+
134138
set_always "BUILDKITE_ECR_POLICY" "${BUILDKITE_ECR_POLICY:-none}"
135139
set_always "BUILDKITE_SECRETS_BUCKET" "$BUILDKITE_SECRETS_BUCKET"
136140
set_always "BUILDKITE_SECRETS_BUCKET_REGION" "$BUILDKITE_SECRETS_BUCKET_REGION"
@@ -357,6 +361,17 @@ done
357361
echo "Waited $next_wait_time times for docker to start. We will exit if it still has not started."
358362
check_docker
359363

364+
echo Writing buildkite-agent systemd environment override...
365+
# also set in /var/lib/buildkite-agent/cfn-env so that it's shown in the job logs
366+
mkdir -p /etc/systemd/system/buildkite-agent.service.d
367+
cat <<EOF | tee /etc/systemd/system/buildkite-agent.service.d/environment.conf
368+
[Service]
369+
Environment="BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB}"
370+
EOF
371+
372+
echo Reloading systemctl services...
373+
systemctl daemon-reload
374+
360375
echo Starting buildkite-agent...
361376
systemctl enable --now buildkite-agent
362377

packer/linux/conf/buildkite-agent/scripts/terminate-instance

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,51 @@
22

33
set -euo pipefail
44

5-
echo "sleeping for 10 seconds before terminating instance to allow agent logs to drain to cloudwatch..."
5+
terminate() {
6+
aws autoscaling terminate-instance-in-auto-scaling-group \
7+
--region "$1" \
8+
--instance-id "$2" \
9+
--should-decrement-desired-capacity
10+
}
11+
12+
mark_as_unhealthy() {
13+
aws autoscaling set-instance-health \
14+
--region "$1" \
15+
--instance-id "$2" \
16+
--health-status Unhealthy
17+
}
618

19+
echo "sleeping for 10 seconds before terminating instance to allow agent logs to drain to cloudwatch..."
720
sleep 10
821

9-
token=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 60" --fail --silent --show-error --location "http://169.254.169.254/latest/api/token")
10-
instance_id=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/instance-id")
11-
region=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/placement/region")
22+
token=$(
23+
curl \
24+
--fail --silent --show-error \
25+
-X PUT \
26+
-H "X-aws-ec2-metadata-token-ttl-seconds: 60" \
27+
--location "http://169.254.169.254/latest/api/token"
28+
)
29+
instance_id=$(
30+
curl \
31+
--fail --silent --show-error \
32+
-H "X-aws-ec2-metadata-token: $token" \
33+
--location "http://169.254.169.254/latest/meta-data/instance-id"
34+
)
35+
region=$(
36+
curl \
37+
--fail --silent --show-error \
38+
-H "X-aws-ec2-metadata-token: $token" \
39+
--location "http://169.254.169.254/latest/meta-data/placement/region"
40+
)
1241

1342
echo "requesting instance termination..."
14-
15-
aws autoscaling terminate-instance-in-auto-scaling-group --region "$region" --instance-id "$instance_id" "--should-decrement-desired-capacity"
43+
if [[ $BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB == "true" ]]; then
44+
# If the ASG is at the min capacity, the call to terminate-instance-in-auto-scaling-group will fail.
45+
# In this case, we mark the instance as unhealthy, then the ASG will spin up a new instance
46+
# to replace it.
47+
terminate "$region" "$instance_id" || mark_as_unhealthy "$region" "$instance_id"
48+
else
49+
# If we're not in terminate-after-job mode, then it's fine for this to fail. Systemd will restart
50+
# the agent and it'll be as if the instance never got shut down.
51+
terminate "$region" "$instance_id"
52+
fi

packer/windows/conf/bin/bk-install-elastic-stack.ps1

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ function set_always() {
7272
Add-Content -Path C:\buildkite-agent\cfn-env -Value @"
7373
7474
set_always "BUILDKITE_AGENTS_PER_INSTANCE" "$Env:BUILDKITE_AGENTS_PER_INSTANCE"
75+
76+
# also set via nssm
77+
set_always "BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB" "$Env:BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB"
78+
7579
set_always "BUILDKITE_ECR_POLICY" "$Env:BUILDKITE_ECR_POLICY"
7680
set_always "BUILDKITE_SECRETS_BUCKET" "$Env:BUILDKITE_SECRETS_BUCKET"
7781
set_always "BUILDKITE_SECRETS_BUCKET_REGION" "$Env:BUILDKITE_SECRETS_BUCKET_REGION"
@@ -146,7 +150,7 @@ tracing-backend=${Env:BUILDKITE_AGENT_TRACING_BACKEND}
146150
"@
147151
$OFS=" "
148152

149-
nssm set lifecycled AppEnvironmentExtra :AWS_REGION=$Env:AWS_REGION
153+
nssm set lifecycled AppEnvironmentExtra +AWS_REGION=$Env:AWS_REGION
150154
nssm set lifecycled AppEnvironmentExtra +LIFECYCLED_HANDLER="C:\buildkite-agent\bin\stop-agent-gracefully.ps1"
151155
Restart-Service lifecycled
152156

@@ -212,26 +216,35 @@ Write-Output "Starting the Buildkite Agent"
212216

213217
nssm install buildkite-agent C:\buildkite-agent\bin\buildkite-agent.exe start
214218
If ($lastexitcode -ne 0) { Exit $lastexitcode }
219+
215220
nssm set buildkite-agent ObjectName .\$UserName $Password
216221
If ($lastexitcode -ne 0) { Exit $lastexitcode }
222+
217223
nssm set buildkite-agent AppStdout C:\buildkite-agent\buildkite-agent.log
218224
If ($lastexitcode -ne 0) { Exit $lastexitcode }
225+
219226
nssm set buildkite-agent AppStderr C:\buildkite-agent\buildkite-agent.log
220227
If ($lastexitcode -ne 0) { Exit $lastexitcode }
221-
nssm set buildkite-agent AppEnvironmentExtra :HOME=C:\buildkite-agent
228+
229+
nssm set buildkite-agent AppEnvironmentExtra +HOME=C:\buildkite-agent
222230

223231
If ((![string]::IsNullOrEmpty($Env:BUILDKITE_ENV_FILE_URL)) -And (Test-Path -Path C:\buildkite-agent\env -PathType leaf)) {
224232
foreach ($var in Get-Content C:\buildkite-agent\env) {
225-
nssm set buildkite-agent AppEnvironmentExtra $var
233+
nssm set buildkite-agent AppEnvironmentExtra "+$var"
226234
If ($lastexitcode -ne 0) { Exit $lastexitcode }
227235
}
228236
}
229237

238+
# also set in cfn-env so that it's shown in the job logs
239+
nssm set buildkite-agent AppEnvironmentExtra +BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=$Env:BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB
230240
If ($lastexitcode -ne 0) { Exit $lastexitcode }
241+
231242
nssm set buildkite-agent AppExit Default Restart
232243
If ($lastexitcode -ne 0) { Exit $lastexitcode }
244+
233245
nssm set buildkite-agent AppRestartDelay 10000
234246
If ($lastexitcode -ne 0) { Exit $lastexitcode }
247+
235248
nssm set buildkite-agent AppEvents Exit/Post "powershell C:\buildkite-agent\bin\terminate-instance.ps1"
236249
If ($lastexitcode -ne 0) { Exit $lastexitcode }
237250

packer/windows/conf/buildkite-agent/scripts/terminate-instance.ps1

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,28 @@ $Token = (Invoke-WebRequest -UseBasicParsing -Method Put -Headers @{'X-aws-ec2-m
33
$InstanceId = (Invoke-WebRequest -UseBasicParsing -Headers @{'X-aws-ec2-metadata-token' = $Token} http://169.254.169.254/latest/meta-data/instance-id).content
44
$Region = (Invoke-WebRequest -UseBasicParsing -Headers @{'X-aws-ec2-metadata-token' = $Token} http://169.254.169.254/latest/meta-data/placement/region).content
55

6+
Write-Output "terminate-instance: disconnecting agent..."
7+
nssm stop buildkite-agent
8+
69
Write-Output "terminate-instance: requesting instance termination..."
710
aws autoscaling terminate-instance-in-auto-scaling-group --region "$Region" --instance-id "$InstanceId" "--should-decrement-desired-capacity" 2> $null
811

9-
if ($lastexitcode -eq 0) { # If autoscaling request was successful, we will terminate
10-
Write-Output "terminate-instance: disabling buildkite-agent service"
11-
nssm stop buildkite-agent
12-
}
13-
else {
12+
# If autoscaling request was successful, we will terminate the instance, otherwise, if
13+
# BuildkiteTerminateInstanceAfterJob is set to true, we will mark the instance as unhealthy
14+
# so that the ASG will terminate it despite scale-in protection. Otherwise, we should not
15+
# terminate the instance, so we need to restart the agent.
16+
if ($lastexitcode -eq 0) {
17+
Write-Output "terminate-instance: terminating instance..."
18+
} else {
1419
Write-Output "terminate-instance: ASG could not decrement (we're already at minSize)"
20+
if ($Env:BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB -eq "true") {
21+
Write-Output "terminate-instance: marking instance as unhealthy"
22+
aws autoscaling set-instance-health `
23+
--instance-id "$InstanceId" `
24+
--region "$Region" `
25+
--health-status Unhealthy
26+
} else {
27+
Write-Output "terminate-instance: restarting agent..."
28+
nssm start buildkite-agent
29+
}
1530
}

0 commit comments

Comments
 (0)