Skip to content

Commit 6059133

Browse files
committed
more robust MAX_LIFETIME_MINUTES shutdown logic
some shared-functions factoring
1 parent 01dd723 commit 6059133

File tree

3 files changed

+236
-115
lines changed

3 files changed

+236
-115
lines changed

src/ec2_gha/templates/shared-functions.sh

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,36 +6,39 @@
66
# Print a timestamped message to stdout and append the same line to the
# runner setup log.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a /var/log/runner-setup.log
}
77
# Log an "ERROR: "-prefixed message through log(), sending log's stdout
# to stderr.
#   $1 - error text
log_error() {
  log "ERROR: $1" 1>&2
}
88

9+
# Shorthand for the null device; used as a redirection target throughout this file
dn=/dev/null
10+
911
# Wait for dpkg lock to be released (for Debian/Ubuntu systems)
1012
wait_for_dpkg_lock() {
11-
local timeout=120
12-
while fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1; do
13-
if [ $timeout -le 0 ]; then
14-
log "WARNING: dpkg lock timeout, proceeding anyway"
13+
local t=120
14+
local L=/var/lib/dpkg/lock
15+
while fuser $L-frontend >$dn 2>&1 || fuser $L >$dn 2>&1; do
16+
if [ $t -le 0 ]; then
17+
# Wait budget exhausted while the lock is still held: warn, then stop waiting
log "WARNING: dpkg lock timeout, proceeding anyway"
1518
break
1619
fi
17-
log "dpkg is locked, waiting... ($timeout seconds remaining)"
20+
log "dpkg is locked, waiting... ($t seconds remaining)"
1821
sleep 5
19-
timeout=$((timeout - 5))
22+
t=$((t - 5))
2023
done
2124
}
2225

2326
# Function to flush CloudWatch logs before shutdown
2427
flush_cloudwatch_logs() {
2528
log "Stopping CloudWatch agent to flush logs"
2629
if systemctl is-active --quiet amazon-cloudwatch-agent; then
27-
systemctl stop amazon-cloudwatch-agent 2>/dev/null || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a stop -m ec2 2>/dev/null || true
30+
systemctl stop amazon-cloudwatch-agent 2>$dn || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a stop -m ec2 2>$dn || true
2831
fi
2932
}
3033

3134
# Get EC2 instance metadata (IMDSv2 compatible)
3235
get_metadata() {
3336
local path="$1"
34-
local token=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 300" http://169.254.169.254/latest/api/token 2>/dev/null || true)
37+
local token=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 300" http://169.254.169.254/latest/api/token 2>$dn || true)
3538
if [ -n "$token" ]; then
36-
curl -s -H "X-aws-ec2-metadata-token: $token" "http://169.254.169.254/latest/meta-data/$path" 2>/dev/null || echo "unknown"
39+
curl -s -H "X-aws-ec2-metadata-token: $token" "http://169.254.169.254/latest/meta-data/$path" 2>$dn || echo "unknown"
3740
else
38-
curl -s "http://169.254.169.254/latest/meta-data/$path" 2>/dev/null || echo "unknown"
41+
curl -s "http://169.254.169.254/latest/meta-data/$path" 2>$dn || echo "unknown"
3942
fi
4043
return 0 # Always return success to avoid set -e issues
4144
}
@@ -46,7 +49,7 @@ deregister_all_runners() {
4649
if [ -d "$RUNNER_DIR" ] && [ -f "$RUNNER_DIR/config.sh" ]; then
4750
log "Deregistering runner in $RUNNER_DIR"
4851
cd "$RUNNER_DIR"
49-
pkill -INT -f "$RUNNER_DIR/run.sh" 2>/dev/null || true
52+
pkill -INT -f "$RUNNER_DIR/run.sh" 2>$dn || true
5053
sleep 1
5154
if [ -f "$RUNNER_DIR/.runner-token" ]; then
5255
TOKEN=$(cat "$RUNNER_DIR/.runner-token")
@@ -62,7 +65,7 @@ debug_sleep_and_shutdown() {
6265
if [ "$debug" = "true" ] || [ "$debug" = "True" ] || [ "$debug" = "1" ]; then
6366
log "Debug: Sleeping 600s before shutdown..."
6467
# Detect the SSH user from the home directory
65-
local ssh_user=$(basename "$homedir" 2>/dev/null || echo "ec2-user")
68+
local ssh_user=$(basename "$homedir" 2>$dn || echo "ec2-user")
6669
# Reuse the IMDSv2-aware helper; a bare curl gets no data when session tokens are required
local public_ip
public_ip=$(get_metadata "public-ipv4")
6770
log "SSH into instance with: ssh ${ssh_user}@${public_ip}"
6871
log "Then check: /var/log/runner-setup.log and /var/log/runner-debug.log"
@@ -121,24 +124,24 @@ configure_runner() {
121124
# Install dependencies if needed
122125
if [ -f ./bin/installdependencies.sh ]; then
123126
# Quick check for common AMIs with pre-installed deps
124-
if command -v dpkg >/dev/null 2>&1 && dpkg -l libicu[0-9]* 2>/dev/null | grep -q ^ii; then
127+
if command -v dpkg >$dn 2>&1 && dpkg -l libicu[0-9]* 2>$dn | grep -q ^ii; then
125128
log "Dependencies exist, skipping install"
126129
else
127130
log "Installing dependencies..."
128131
set +e
129-
sudo ./bin/installdependencies.sh >/dev/null 2>&1
132+
sudo ./bin/installdependencies.sh >$dn 2>&1
130133
local deps_result=$?
131134
set -e
132135
if [ $deps_result -ne 0 ]; then
133136
log "Dependencies script failed, installing manually..."
134-
if command -v dnf >/dev/null 2>&1; then
135-
sudo dnf install -y libicu lttng-ust >/dev/null 2>&1 || true
136-
elif command -v yum >/dev/null 2>&1; then
137-
sudo yum install -y libicu >/dev/null 2>&1 || true
138-
elif command -v apt-get >/dev/null 2>&1; then
137+
if command -v dnf >$dn 2>&1; then
138+
sudo dnf install -y libicu lttng-ust >$dn 2>&1 || true
139+
elif command -v yum >$dn 2>&1; then
140+
sudo yum install -y libicu >$dn 2>&1 || true
141+
elif command -v apt-get >$dn 2>&1; then
139142
wait_for_dpkg_lock
140-
sudo apt-get update >/dev/null 2>&1 || true
141-
sudo apt-get install -y libicu-dev >/dev/null 2>&1 || true
143+
sudo apt-get update >$dn 2>&1 || true
144+
sudo apt-get install -y libicu-dev >$dn 2>&1 || true
142145
fi
143146
fi
144147
fi
@@ -169,7 +172,7 @@ EOF
169172
fi
170173

171174
# Start runner in background
172-
RUNNER_ALLOW_RUNASROOT=1 nohup ./run.sh > /dev/null 2>&1 &
175+
RUNNER_ALLOW_RUNASROOT=1 nohup ./run.sh > $dn 2>&1 &
173176
local pid=$!
174177
log "Started runner $idx in $runner_dir (PID: $pid)"
175178

src/ec2_gha/templates/user-script.sh.templ

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,25 @@ log "Instance metadata: Type=$${INSTANCE_TYPE} ID=$${INSTANCE_ID} Region=$${REGI
116116
# Set up maximum lifetime timeout - instance will terminate after this time regardless of job status
117117
MAX_LIFETIME_MINUTES=$max_instance_lifetime
118118
log "Setting up maximum lifetime timeout: $${MAX_LIFETIME_MINUTES} minutes"
119-
nohup bash -c "sleep $${MAX_LIFETIME_MINUTES}m && echo '[$$(date)] Maximum lifetime reached' && shutdown -h now" > /var/log/max-lifetime.log 2>&1 &
119+
# Run each step as a separate command instead of chaining with && so shutdown still runs even if echo fails (e.g., disk full)
120+
# Try multiple shutdown methods as fallbacks
121+
nohup bash -c "
122+
sleep $${MAX_LIFETIME_MINUTES}m
123+
# Escape the substitution so date runs after the sleep, not when this job is launched
echo \"[\$$(date)] Maximum lifetime reached\" 2>/dev/null || true
124+
# Try normal shutdown
125+
shutdown -h now 2>/dev/null || {
126+
# If shutdown fails, try halt
127+
halt -f 2>/dev/null || {
128+
# If halt fails, try sysrq if available (Linux only)
129+
if [ -w /proc/sysrq-trigger ]; then
130+
echo 1 > /proc/sys/kernel/sysrq 2>/dev/null
131+
echo o > /proc/sysrq-trigger 2>/dev/null
132+
fi
133+
# Last resort: force immediate reboot
134+
reboot -f 2>/dev/null || true
135+
}
136+
}
137+
" > /var/log/max-lifetime.log 2>&1 &
120138

121139
# Configure CloudWatch Logs if a log group is specified
122140
if [ "$cloudwatch_logs_group" != "" ]; then

0 commit comments

Comments
 (0)