Skip to content
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions goss.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,6 @@ command:
/usr/local/bin/docker-gc:
exit-status: 0

systemctl is-enabled docker-low-disk-gc.timer:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add those back?

exit-status: 0

/usr/local/bin/docker-low-disk-gc:
exit-status: 0

# test that we can build a docker image
docker buildx build -f tests/Dockerfile --progress=plain -t buildkite-postgres:latest tests:
exit-status: 0
Expand Down
174 changes: 174 additions & 0 deletions packer/linux/conf/bin/bk-configure-docker-gc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#!/bin/bash
#
# Configures Docker garbage collection for this instance.
#
# Optional environment variables:
#   DOCKER_GC_SCHEDULE       cleanup schedule, e.g. hourly, daily, weekly,
#                            monthly (default: hourly)
#   DOCKER_GC_PRUNE_UNTIL    minimum age of the resources to prune
#                            (default: 4h)
#   DOCKER_GC_PRUNE_IMAGES   "true" to prune all unused images instead of only
#                            dangling ones (default: false)
#   DOCKER_GC_PRUNE_VOLUMES  "true" to also prune volumes (default: false)
set -euo pipefail

echo "Configuring docker cleanup"

DOCKER_GC_SCHEDULE="${DOCKER_GC_SCHEDULE:-hourly}"
DOCKER_GC_PRUNE_UNTIL="${DOCKER_GC_PRUNE_UNTIL:-4h}"
DOCKER_GC_PRUNE_IMAGES="${DOCKER_GC_PRUNE_IMAGES:-false}"
DOCKER_GC_PRUNE_VOLUMES="${DOCKER_GC_PRUNE_VOLUMES:-false}"

#######################################
# Checks whether a value is a duration that Docker's `until=` prune filter
# accepts. Docker parses relative values as Go durations, whose largest unit
# is the hour, so a day suffix such as "1d" is rejected by `docker ... prune`.
# Compound durations such as "1h30m" are valid.
# Arguments: $1 - candidate duration
# Returns:   0 if the value is one or more <digits><h|m|s> groups, 1 otherwise
#######################################
docker_gc_prune_until_is_valid() {
  local -r duration_pattern='^([0-9]+[hms])+$'
  [[ "$1" =~ $duration_pattern ]]
}

# Validation is advisory only: an unexpected value is reported but the
# configuration still proceeds with it.
if ! docker_gc_prune_until_is_valid "$DOCKER_GC_PRUNE_UNTIL"; then
  echo "Warning: time format not expected: $DOCKER_GC_PRUNE_UNTIL" >&2
  echo "use format like 4h, 30m, 24h (days are not supported, use hours)" >&2
fi

case "$DOCKER_GC_SCHEDULE" in
  hourly | daily | weekly | monthly) ;;
  # Anything containing a digit is let through without a warning.
  *[0-9]*) ;;
  *)
    echo "Warning: time format not expected - $DOCKER_GC_SCHEDULE" >&2
    echo "use hourly, daily, weekly, monthly" >&2
    ;;
esac

echo "Schedule: $DOCKER_GC_SCHEDULE"
echo "Prune older than: $DOCKER_GC_PRUNE_UNTIL"
echo "Cleaning all images: $DOCKER_GC_PRUNE_IMAGES"
echo "Volumes: $DOCKER_GC_PRUNE_VOLUMES"
cat >/usr/local/bin/docker-gc <<'EOF'
#!/bin/bash
set -euo pipefail
if [[ $EUID -eq 0 ]]; then
exec >> /var/log/elastic-stack.log 2>&1
fi
#######################################
# Stops running Buildkite jobs and reports this EC2 instance as Unhealthy to
# its Auto Scaling group. Installed as the ERR trap handler of this script.
# Arguments: none
# Outputs:   progress message on stdout, diagnostics on stderr
# Returns:   0 once the health status has been set; non-zero if the instance
#            metadata could not be read or the aws call failed
#######################################
mark_instance_unhealthy() {
  local -r metadata_url="http://169.254.169.254/latest"
  local -a curl_opts=(--fail --silent --show-error --location)
  # Declared separately from the assignments below so that a failing curl is
  # not hidden behind the (always successful) exit status of `local`.
  local token instance_id region

  # cancel any running buildkite builds
  killall -QUIT buildkite-agent || true

  # mark the instance for termination
  echo "Marking instance as unhealthy"

  if ! token=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 60" "${curl_opts[@]}" "${metadata_url}/api/token"); then
    echo "Failed to obtain an instance metadata token, cannot mark instance unhealthy" >&2
    return 1
  fi

  if ! instance_id=$(curl -H "X-aws-ec2-metadata-token: $token" "${curl_opts[@]}" "${metadata_url}/meta-data/instance-id"); then
    echo "Failed to look up the instance id, cannot mark instance unhealthy" >&2
    return 1
  fi

  if ! region=$(curl -H "X-aws-ec2-metadata-token: $token" "${curl_opts[@]}" "${metadata_url}/meta-data/placement/region"); then
    echo "Failed to look up the instance region, cannot mark instance unhealthy" >&2
    return 1
  fi

  aws autoscaling set-instance-health \
    --instance-id "${instance_id}" \
    --region "${region}" \
    --health-status Unhealthy
}
trap mark_instance_unhealthy ERR
echo "$(date): Docker cleanup starting"
# Check if this is a disk-space triggered cleanup or scheduled cleanup
FORCE_CLEANUP=${1:-""}
if [[ "$FORCE_CLEANUP" != "force" ]]; then
if /usr/local/bin/bk-check-disk-space.sh >/dev/null 2>&1; then
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wasn't disk space already checked?

echo "$(date): Disk space is sufficient, skipping Docker cleanup"
exit 0
fi
echo "$(date): Disk space is low, proceeding with emergency Docker cleanup"
TIME_FILTER="--filter until=DOCKER_PRUNE_UNTIL_PLACEHOLDER"
echo "Cleaning up docker resources older than DOCKER_PRUNE_UNTIL_PLACEHOLDER"
docker image prune --all --force $TIME_FILTER
docker builder prune --all --force $TIME_FILTER
else
echo "$(date): Running scheduled Docker cleanup"
TIME_FILTER="--filter until=DOCKER_PRUNE_UNTIL_PLACEHOLDER"
echo "Cleaning networks and containers"
docker network prune --force
docker container prune --force $TIME_FILTER
if [[ "DOCKER_GC_PRUNE_IMAGES_PLACEHOLDER" == "true" ]]; then
echo "Cleaning all images"
docker image prune --all --force $TIME_FILTER
else
echo "Cleaning dangling images only"
docker image prune --force $TIME_FILTER
fi
if [[ "DOCKER_GC_PRUNE_VOLUMES_PLACEHOLDER" == "true" ]]; then
echo "Cleaning volumes"
docker volume prune --force
fi
fi
# After cleanup, verify we actually freed up space (but only if this was disk-triggered)
if [[ "$FORCE_CLEANUP" != "force" ]]; then
if ! /usr/local/bin/bk-check-disk-space.sh; then
echo "$(date): Disk health checks failed after Docker cleanup" >&2
exit 1
fi
fi
echo "$(date): Docker cleanup completed successfully"
EOF

# The docker-gc script is written from a quoted heredoc, so nothing in it is
# expanded at write time. Replace every DOCKER_PRUNE_UNTIL_PLACEHOLDER token in
# the generated file with the configured prune age.
# NOTE(review): the value is inserted into the sed expression unescaped, so a
# value containing "/" or "&" would break or alter the substitution.
sed -i "s/DOCKER_PRUNE_UNTIL_PLACEHOLDER/$DOCKER_GC_PRUNE_UNTIL/g" /usr/local/bin/docker-gc
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
sed -i "s/DOCKER_PRUNE_UNTIL_PLACEHOLDER/$DOCKER_GC_PRUNE_UNTIL/g" /usr/local/bin/docker-gc
DOCKER_GC_PRUNE_UNTIL="${DOCKER_GC_PRUNE_UNTIL:-4h}"
DOCKER_GC_PRUNE_IMAGES="${DOCKER_GC_PRUNE_IMAGES:-false}"
DOCKER_GC_PRUNE_VOLUMES="${DOCKER_GC_PRUNE_VOLUMES:-false}"

Do we need to sed those variables? Can we refer to vars instead?

# Fill in the image and volume pruning switches of the generated script, then
# make the script executable.
gc_script=/usr/local/bin/docker-gc
sed -i \
  -e "s/DOCKER_GC_PRUNE_IMAGES_PLACEHOLDER/$DOCKER_GC_PRUNE_IMAGES/g" \
  -e "s/DOCKER_GC_PRUNE_VOLUMES_PLACEHOLDER/$DOCKER_GC_PRUNE_VOLUMES/g" \
  "$gc_script"

chmod +x "$gc_script"

# Write the systemd units for the scheduled cleanup: a oneshot service that
# runs docker-gc in "force" mode, and the timer that triggers it on the
# configured schedule.
systemd_unit_dir=/etc/systemd/system

# The service unit contains no expansions, so its heredoc is quoted.
cat >"${systemd_unit_dir}/docker-gc.service" <<'EOF'
[Unit]
Description=Docker GC Cleanup
Wants=docker-gc.timer
[Service]
Type=oneshot
ExecStart=/usr/local/bin/docker-gc force
StandardOutput=journal
StandardError=journal
EOF

# Unquoted heredoc: ${DOCKER_GC_SCHEDULE} is expanded when the file is written.
cat >"${systemd_unit_dir}/docker-gc.timer" <<EOF
[Unit]
Description=Docker GC Cleanup Timer
Requires=docker-gc.service
[Timer]
Unit=docker-gc.service
OnCalendar=${DOCKER_GC_SCHEDULE}
Persistent=true
[Install]
WantedBy=timers.target
EOF

# Load the new units, enable and start the timer, and print a summary.
# Timer setup is best-effort: failures are reported but never fail the script.
echo "Enabling timer"
if ! systemctl daemon-reload; then
  echo "Warning: systemctl daemon-reload failed, retrying in 5 seconds"
  sleep 5
  if ! systemctl daemon-reload; then
    # Give up on the timer, but exit successfully so the caller carries on.
    echo "Error: systemctl daemon-reload failed twice, skipping timer setup"
    exit 0
  fi
fi

if ! systemctl enable docker-gc.timer; then
  echo "Warning: failed to enable docker-gc.timer"
fi

if ! systemctl start docker-gc.timer; then
  echo "Warning: failed to start docker-gc.timer, will retry later"
fi

echo "Docker GC Cleanup configured"
echo "Schedule: $DOCKER_GC_SCHEDULE"
echo "Prune older than: $DOCKER_GC_PRUNE_UNTIL"

case "$DOCKER_GC_PRUNE_IMAGES" in
  true) echo "Will clean all images" ;;
  *) echo "Will clean dangling images only" ;;
esac

case "$DOCKER_GC_PRUNE_VOLUMES" in
  true) echo "Will clean volumes" ;;
  *) echo "Volumes left alone" ;;
esac

echo "Restarting docker daemon..."
systemctl restart docker
6 changes: 0 additions & 6 deletions packer/linux/conf/bin/bk-configure-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,5 @@ else
echo Instance storage not configured.
fi

echo Cleaning up docker images...
systemctl start docker-low-disk-gc.service

echo Enabling docker-gc timers...
systemctl enable docker-gc.timer docker-low-disk-gc.timer

echo Restarting docker daemon...
systemctl restart docker
17 changes: 17 additions & 0 deletions packer/linux/conf/bin/bk-install-elastic-stack.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,12 @@ set_always "BUILDKITE_STACK_NAME" "$BUILDKITE_STACK_NAME"
set_always "BUILDKITE_STACK_VERSION" "$BUILDKITE_STACK_VERSION"
set_always "BUILDKITE_DOCKER_EXPERIMENTAL" "$DOCKER_EXPERIMENTAL"
set_always "DOCKER_USERNS_REMAP" "$DOCKER_USERNS_REMAP"
set_always "DOCKER_GC_SCHEDULE" "$DOCKER_GC_SCHEDULE"
set_always "DOCKER_GC_PRUNE_UNTIL" "$DOCKER_GC_PRUNE_UNTIL"
set_always "DOCKER_GC_PRUNE_IMAGES" "$DOCKER_GC_PRUNE_IMAGES"
set_always "DOCKER_GC_PRUNE_VOLUMES" "$DOCKER_GC_PRUNE_VOLUMES"
set_always "DISK_MIN_AVAILABLE" "$DISK_MIN_AVAILABLE"
set_always "DISK_MIN_INODES" "$DISK_MIN_INODES"
set_always "DOCKER_VERSION" "$DOCKER_VERSION"
set_always "PLUGINS_ENABLED" "${PLUGINS_ENABLED[*]-}"
set_always "BUILDKITE_ARTIFACTS_BUCKET" "$BUILDKITE_ARTIFACTS_BUCKET"
Expand Down Expand Up @@ -418,6 +424,17 @@ fi
echo "Waited $next_wait_time times for docker to start. We will exit if it still has not started."
check_docker

# Run the Docker GC configuration script if it is installed. This step is
# best-effort: a missing or failing script only produces a warning.
echo "Configuring Docker garbage collection..."
docker_gc_configure_script="/usr/local/bin/bk-configure-docker-gc.sh"
if [[ ! -f "$docker_gc_configure_script" ]]; then
  echo "Warning: bk-configure-docker-gc.sh not found, skipping Docker GC configuration"
elif "$docker_gc_configure_script"; then
  echo "Docker GC configuration completed successfully"
else
  echo "Warning: Docker GC configuration failed, continuing with installation..."
fi

echo Writing buildkite-agent systemd environment override...
# also set in /var/lib/buildkite-agent/cfn-env so that it's shown in the job logs
mkdir -p /etc/systemd/system/buildkite-agent.service.d
Expand Down
7 changes: 5 additions & 2 deletions packer/linux/conf/buildkite-agent/hooks/environment
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,11 @@ fi

echo "Checking disk space"
if ! /usr/local/bin/bk-check-disk-space.sh; then
echo "Cleaning up docker resources older than ${DOCKER_PRUNE_UNTIL:-4h}"
docker image prune --all --force --filter "until=${DOCKER_PRUNE_UNTIL:-4h}"
echo "Disk space low, triggering emergency Docker cleanup"
if ! /usr/local/bin/docker-gc; then
echo "Emergency Docker cleanup failed" >&2
exit 1
fi

echo "Checking disk space again"
# Capture disk space output for potential error logging
Expand Down
14 changes: 0 additions & 14 deletions packer/linux/conf/docker/scripts/docker-gc

This file was deleted.

45 changes: 0 additions & 45 deletions packer/linux/conf/docker/scripts/docker-low-disk-gc

This file was deleted.

7 changes: 0 additions & 7 deletions packer/linux/conf/docker/systemd/docker-gc.service

This file was deleted.

10 changes: 0 additions & 10 deletions packer/linux/conf/docker/systemd/docker-gc.timer

This file was deleted.

7 changes: 0 additions & 7 deletions packer/linux/conf/docker/systemd/docker-low-disk-gc.service

This file was deleted.

10 changes: 0 additions & 10 deletions packer/linux/conf/docker/systemd/docker-low-disk-gc.timer

This file was deleted.

6 changes: 4 additions & 2 deletions packer/linux/scripts/install-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ sudo mkdir -p /etc/docker
sudo cp /tmp/conf/docker/daemon.json /etc/docker/daemon.json

echo "Adding docker systemd timers..."
sudo cp /tmp/conf/docker/scripts/* /usr/local/bin
if ls /tmp/conf/docker/scripts/* >/dev/null 2>&1; then
sudo cp /tmp/conf/docker/scripts/* /usr/local/bin
sudo chmod +x /usr/local/bin/docker-*
fi
sudo cp /tmp/conf/docker/systemd/docker-* /etc/systemd/system
sudo chmod +x /usr/local/bin/docker-*

echo "Installing docker buildx..."
DOCKER_CLI_DIR=/usr/libexec/docker/cli-plugins
Expand Down
Loading