diff --git a/goss.yaml b/goss.yaml index 0d76ed40d..290e81018 100644 --- a/goss.yaml +++ b/goss.yaml @@ -160,12 +160,6 @@ command: /usr/local/bin/docker-gc: exit-status: 0 - systemctl is-enabled docker-low-disk-gc.timer: - exit-status: 0 - - /usr/local/bin/docker-low-disk-gc: - exit-status: 0 - # test that we can build a docker image docker buildx build -f tests/Dockerfile --progress=plain -t buildkite-postgres:latest tests: exit-status: 0 diff --git a/packer/linux/conf/bin/bk-configure-docker-gc.sh b/packer/linux/conf/bin/bk-configure-docker-gc.sh new file mode 100644 index 000000000..5a5ee2031 --- /dev/null +++ b/packer/linux/conf/bin/bk-configure-docker-gc.sh @@ -0,0 +1,174 @@ +#!/bin/bash +set -euo pipefail + +echo "Configuring docker cleanup" + +DOCKER_GC_SCHEDULE="${DOCKER_GC_SCHEDULE:-hourly}" +DOCKER_GC_PRUNE_UNTIL="${DOCKER_GC_PRUNE_UNTIL:-4h}" +DOCKER_GC_PRUNE_IMAGES="${DOCKER_GC_PRUNE_IMAGES:-false}" +DOCKER_GC_PRUNE_VOLUMES="${DOCKER_GC_PRUNE_VOLUMES:-false}" + +if ! [[ "$DOCKER_GC_PRUNE_UNTIL" =~ ^[0-9]+[smhd]$ ]]; then + echo "Warning: time format not expected: $DOCKER_GC_PRUNE_UNTIL" >&2 + echo "use format like 4h, 30m, 1d" >&2 +fi + +case "$DOCKER_GC_SCHEDULE" in +hourly | daily | weekly | monthly) ;; +*[0-9]*) ;; +*) + echo "Warning: time format not expected - $DOCKER_GC_SCHEDULE" >&2 + echo "use hourly, daily, weekly, monthly" >&2 + ;; +esac + +echo "Schedule: $DOCKER_GC_SCHEDULE" +echo "Prune older than: $DOCKER_GC_PRUNE_UNTIL" +echo "Cleaning all images: $DOCKER_GC_PRUNE_IMAGES" +echo "Volumes: $DOCKER_GC_PRUNE_VOLUMES" + +cat >/usr/local/bin/docker-gc <<'EOF' +#!/bin/bash +set -euo pipefail + +if [[ $EUID -eq 0 ]]; then + exec >> /var/log/elastic-stack.log 2>&1 +fi + +mark_instance_unhealthy() { + # cancel any running buildkite builds + killall -QUIT buildkite-agent || true + + # mark the instance for termination + echo "Marking instance as unhealthy" + + # shellcheck disable=SC2155 + local token=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 60" --fail --silent --show-error --location "http://169.254.169.254/latest/api/token") + # shellcheck disable=SC2155 + local instance_id=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/instance-id") + # shellcheck disable=SC2155 + local region=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/placement/region") + + aws autoscaling set-instance-health \ + --instance-id "${instance_id}" \ + --region "${region}" \ + --health-status Unhealthy +} + +trap mark_instance_unhealthy ERR + +echo "$(date): Docker cleanup starting" + +# Check if this is a disk-space triggered cleanup or scheduled cleanup +FORCE_CLEANUP=${1:-""} +if [[ "$FORCE_CLEANUP" != "force" ]]; then + if /usr/local/bin/bk-check-disk-space.sh >/dev/null 2>&1; then + echo "$(date): Disk space is sufficient, skipping Docker cleanup" + exit 0 + fi + echo "$(date): Disk space is low, proceeding with emergency Docker cleanup" + + TIME_FILTER="--filter until=DOCKER_PRUNE_UNTIL_PLACEHOLDER" + echo "Cleaning up docker resources older than DOCKER_PRUNE_UNTIL_PLACEHOLDER" + docker image prune --all --force $TIME_FILTER + docker builder prune --all --force $TIME_FILTER +else + echo "$(date): Running scheduled Docker cleanup" + + TIME_FILTER="--filter until=DOCKER_PRUNE_UNTIL_PLACEHOLDER" + + echo "Cleaning networks and containers" + docker network prune --force + docker container prune --force $TIME_FILTER + + if [[ "DOCKER_GC_PRUNE_IMAGES_PLACEHOLDER" == "true" ]]; then + echo "Cleaning all images" + docker image prune --all --force $TIME_FILTER + else + echo "Cleaning dangling images only" + docker image prune --force $TIME_FILTER + fi + + if [[ "DOCKER_GC_PRUNE_VOLUMES_PLACEHOLDER" == "true" ]]; then + echo "Cleaning volumes" + docker volume prune --force + fi +fi + +# After cleanup, verify we actually freed up space (but only if this was disk-triggered) +if [[ "$FORCE_CLEANUP" != "force" ]]; then + if ! /usr/local/bin/bk-check-disk-space.sh; then + echo "$(date): Disk health checks failed after Docker cleanup" >&2 + exit 1 + fi +fi + +echo "$(date): Docker cleanup completed successfully" +EOF + +sed -i "s/DOCKER_PRUNE_UNTIL_PLACEHOLDER/$DOCKER_GC_PRUNE_UNTIL/g" /usr/local/bin/docker-gc +sed -i "s/DOCKER_GC_PRUNE_IMAGES_PLACEHOLDER/$DOCKER_GC_PRUNE_IMAGES/g" /usr/local/bin/docker-gc +sed -i "s/DOCKER_GC_PRUNE_VOLUMES_PLACEHOLDER/$DOCKER_GC_PRUNE_VOLUMES/g" /usr/local/bin/docker-gc + +chmod +x /usr/local/bin/docker-gc + +cat >/etc/systemd/system/docker-gc.timer </etc/systemd/system/docker-gc.service <&2 + exit 1 + fi echo "Checking disk space again" # Capture disk space output for potential error logging diff --git a/packer/linux/conf/docker/scripts/docker-gc b/packer/linux/conf/docker/scripts/docker-gc deleted file mode 100755 index edd8d6943..000000000 --- a/packer/linux/conf/docker/scripts/docker-gc +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -euo pipefail - -if [[ $EUID -eq 0 ]]; then - exec >>/var/log/elastic-stack.log 2>&1 # Logs to elastic-stack.log -fi - -DOCKER_PRUNE_UNTIL=${DOCKER_PRUNE_UNTIL:-4h} - -## ------------------------------------------ -## Prune stuff that doesn't affect cache hits - -docker network prune --force --filter "until=${DOCKER_PRUNE_UNTIL}" -docker container prune --force --filter "until=${DOCKER_PRUNE_UNTIL}" diff --git a/packer/linux/conf/docker/scripts/docker-low-disk-gc b/packer/linux/conf/docker/scripts/docker-low-disk-gc deleted file mode 100755 index 3f74f73c4..000000000 --- a/packer/linux/conf/docker/scripts/docker-low-disk-gc +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -set -euo pipefail - -if [[ $EUID -eq 0 ]]; then - exec >>/var/log/elastic-stack.log 2>&1 # Logs to elastic-stack.log -fi - -DOCKER_PRUNE_UNTIL=${DOCKER_PRUNE_UNTIL:-1h} - -mark_instance_unhealthy() { - # cancel any running buildkite builds - killall -QUIT buildkite-agent || true - - # mark the instance for termination - echo "Marking instance as unhealthy" - - # shellcheck disable=SC2155 - local token=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 60" --fail --silent --show-error --location "http://169.254.169.254/latest/api/token") - # shellcheck disable=SC2155 - local instance_id=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/instance-id") - # shellcheck disable=SC2155 - local region=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/placement/region") - - aws autoscaling set-instance-health \ - --instance-id "${instance_id}" \ - --region "${region}" \ - --health-status Unhealthy -} - -trap mark_instance_unhealthy ERR - -## ----------------------------------------------------------------- -## Check disk, we only want to prune images/containers/build caches -## if we really need to - -if ! /usr/local/bin/bk-check-disk-space.sh; then - echo "Cleaning up docker resources older than ${DOCKER_PRUNE_UNTIL}" - docker image prune --all --force --filter "until=${DOCKER_PRUNE_UNTIL}" - docker builder prune --all --force --filter "until=${DOCKER_PRUNE_UNTIL}" - - if ! /usr/local/bin/bk-check-disk-space.sh; then - echo "Disk health checks failed" >&2 && false - exit 1 - fi -fi diff --git a/packer/linux/conf/docker/systemd/docker-gc.service b/packer/linux/conf/docker/systemd/docker-gc.service deleted file mode 100644 index be0cffa2b..000000000 --- a/packer/linux/conf/docker/systemd/docker-gc.service +++ /dev/null @@ -1,7 +0,0 @@ -[Unit] -Description=Clean files used by docker except for the build cache -Wants=docker-gc.timer - -[Service] -Type=oneshot -ExecStart=/usr/local/bin/docker-gc diff --git a/packer/linux/conf/docker/systemd/docker-gc.timer b/packer/linux/conf/docker/systemd/docker-gc.timer deleted file mode 100644 index 93886059d..000000000 --- a/packer/linux/conf/docker/systemd/docker-gc.timer +++ /dev/null @@ -1,10 +0,0 @@ -[Unit] -Description=Clean files used by docker except for the build cache -Requires=docker-gc.service - -[Timer] -Unit=docker-gc.service -OnCalendar=hourly - -[Install] -WantedBy=timers.target diff --git a/packer/linux/conf/docker/systemd/docker-low-disk-gc.service b/packer/linux/conf/docker/systemd/docker-low-disk-gc.service deleted file mode 100644 index 4e19e751f..000000000 --- a/packer/linux/conf/docker/systemd/docker-low-disk-gc.service +++ /dev/null @@ -1,7 +0,0 @@ -[Unit] -Description=Clean files used by docker including the build cache when disk space is low -Wants=docker-low-disk-gc.timer - -[Service] -Type=oneshot -ExecStart=/usr/local/bin/docker-low-disk-gc diff --git a/packer/linux/conf/docker/systemd/docker-low-disk-gc.timer b/packer/linux/conf/docker/systemd/docker-low-disk-gc.timer deleted file mode 100644 index 900234413..000000000 --- a/packer/linux/conf/docker/systemd/docker-low-disk-gc.timer +++ /dev/null @@ -1,10 +0,0 @@ -[Unit] -Description=Clean files used by docker including the build cache when disk space is low -Requires=docker-low-disk-gc.service - -[Timer] -Unit=docker-low-disk-gc.service -OnCalendar=hourly - -[Install] -WantedBy=timers.target diff --git a/packer/linux/scripts/install-docker.sh b/packer/linux/scripts/install-docker.sh index e182647da..39b70cf91 100755 --- a/packer/linux/scripts/install-docker.sh +++ b/packer/linux/scripts/install-docker.sh @@ -17,9 +17,11 @@ sudo mkdir -p /etc/docker sudo cp /tmp/conf/docker/daemon.json /etc/docker/daemon.json echo "Adding docker systemd timers..." -sudo cp /tmp/conf/docker/scripts/* /usr/local/bin +if ls /tmp/conf/docker/scripts/* >/dev/null 2>&1; then + sudo cp /tmp/conf/docker/scripts/* /usr/local/bin + sudo chmod +x /usr/local/bin/docker-* +fi sudo cp /tmp/conf/docker/systemd/docker-* /etc/systemd/system -sudo chmod +x /usr/local/bin/docker-* echo "Installing docker buildx..." DOCKER_CLI_DIR=/usr/libexec/docker/cli-plugins diff --git a/packer/windows/conf/bin/bk-configure-docker-gc.ps1 b/packer/windows/conf/bin/bk-configure-docker-gc.ps1 new file mode 100644 index 000000000..170b1f3b8 --- /dev/null +++ b/packer/windows/conf/bin/bk-configure-docker-gc.ps1 @@ -0,0 +1,112 @@ +$ErrorActionPreference = "Stop" + +Write-Output "Configuring docker cleanup" + +$DockerGcSchedule = if ($env:DOCKER_GC_SCHEDULE) { $env:DOCKER_GC_SCHEDULE } else { "hourly" } +$DockerGcPruneUntil = if ($env:DOCKER_GC_PRUNE_UNTIL) { $env:DOCKER_GC_PRUNE_UNTIL } else { "4h" } +$DockerGcPruneImages = if ($env:DOCKER_GC_PRUNE_IMAGES) { $env:DOCKER_GC_PRUNE_IMAGES } else { "false" } +$DockerGcPruneVolumes = if ($env:DOCKER_GC_PRUNE_VOLUMES) { $env:DOCKER_GC_PRUNE_VOLUMES } else { "false" } + +if ($DockerGcPruneUntil -notmatch '^[0-9]+[smhd]$') { + Write-Warning "time format not expected: $DockerGcPruneUntil" + Write-Warning "use format like 4h, 30m, 1d" +} + +switch ($DockerGcSchedule) { + { $_ -in @("hourly", "daily", "weekly", "monthly") } { break } + { $_ -match '[0-9]+' } { break } + default { + Write-Warning "time format not expected - $DockerGcSchedule" + Write-Warning "use hourly, daily, weekly, monthly" + } +} + +Write-Output "Schedule: $DockerGcSchedule" +Write-Output "Prune older than: $DockerGcPruneUntil" +Write-Output "Cleaning all images: $DockerGcPruneImages" +Write-Output "Volumes: $DockerGcPruneVolumes" + +$dockerGcScript = @" +# Stop script execution when a non-terminating error occurs +`$ErrorActionPreference = "Stop" + +# Log to the main log file +`$logFile = "C:\buildkite-agent\elastic-stack.log" +Add-Content -Path `$logFile -Value "`$(Get-Date): Docker cleanup starting" + +`$TimeFilter = "--filter until=$DockerGcPruneUntil" + +Add-Content -Path `$logFile -Value "Cleaning networks and containers" +docker network prune --force +docker container prune --force `$TimeFilter + +if ("$DockerGcPruneImages" -eq "true") { + Add-Content -Path `$logFile -Value "Cleaning all images" + docker image prune --all --force `$TimeFilter +} else { + Add-Content -Path `$logFile -Value "Cleaning dangling images only" + docker image prune --force `$TimeFilter +} + +if ("$DockerGcPruneVolumes" -eq "true") { + Add-Content -Path `$logFile -Value "Cleaning volumes" + docker volume prune --force +} + +Add-Content -Path `$logFile -Value "`$(Get-Date): Docker cleanup completed" +"@ + +$dockerGcScript | Out-File -FilePath "C:\buildkite-agent\bin\docker-gc.ps1" -Encoding UTF8 + +$taskName = "DockerGC" + +$trigger = switch ($DockerGcSchedule) { + "hourly" { New-ScheduledTaskTrigger -Once -At (Get-Date) -RepetitionInterval (New-TimeSpan -Hours 1) -RepetitionDuration (New-TimeSpan -Days 365) } + "daily" { New-ScheduledTaskTrigger -Daily -At "2:00AM" } + "weekly" { New-ScheduledTaskTrigger -Weekly -At "2:00AM" -DaysOfWeek Sunday } + "monthly" { New-ScheduledTaskTrigger -Weekly -At "2:00AM" -WeeksInterval 4 -DaysOfWeek Sunday } + default { New-ScheduledTaskTrigger -Once -At (Get-Date) -RepetitionInterval (New-TimeSpan -Hours 1) -RepetitionDuration (New-TimeSpan -Days 365) } +} + +$action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-ExecutionPolicy Bypass -File C:\buildkite-agent\bin\docker-gc.ps1 >> C:\buildkite-agent\elastic-stack.log 2>&1" +$settings = New-ScheduledTaskSettingsSet -AllowStartIfOnBatteries -DontStopIfGoingOnBatteries -StartWhenAvailable + +Write-Output "creating scheduled task" +try { + Register-ScheduledTask -TaskName $taskName -Trigger $trigger -Action $action -Settings $settings -Force | Out-Null + Write-Output "scheduled task created successfully" +} catch { + Write-Warning "failed to create scheduled task: $_" + Write-Warning "retrying in 5 seconds..." + Start-Sleep -Seconds 5 + try { + Register-ScheduledTask -TaskName $taskName -Trigger $trigger -Action $action -Settings $settings -Force | Out-Null + Write-Output "scheduled task created successfully on retry" + } catch { + Write-Warning "failed to create scheduled task twice, skipping timer setup" + return + } +} + +Write-Output "Docker GC Cleanup configured" +Write-Output "Schedule: $DockerGcSchedule" +Write-Output "Prune older than: $DockerGcPruneUntil" +if ($DockerGcPruneImages -eq "true") { + Write-Output "Will clean all images" +} else { + Write-Output "Will clean dangling images only" +} +if ($DockerGcPruneVolumes -eq "true") { + Write-Output "Will clean volumes" +} else { + Write-Output "Volumes left alone" +} + +Write-Output "Restarting Docker service..." +try { + Restart-Service docker + Write-Output "Docker service restarted successfully" +} catch { + Write-Warning "Failed to restart Docker service: $_" + Write-Warning "Continuing without Docker restart..." +} diff --git a/packer/windows/conf/bin/bk-install-elastic-stack.ps1 b/packer/windows/conf/bin/bk-install-elastic-stack.ps1 index cbc75366b..d6850945e 100644 --- a/packer/windows/conf/bin/bk-install-elastic-stack.ps1 +++ b/packer/windows/conf/bin/bk-install-elastic-stack.ps1 @@ -82,6 +82,10 @@ set_always "BUILDKITE_SECRETS_BUCKET_REGION" "$Env:BUILDKITE_SECRETS_BUC set_always "BUILDKITE_STACK_NAME" "$Env:BUILDKITE_STACK_NAME" set_always "BUILDKITE_STACK_VERSION" "$Env:BUILDKITE_STACK_VERSION" set_always "BUILDKITE_DOCKER_EXPERIMENTAL" "$DOCKER_EXPERIMENTAL" +set_always "DOCKER_GC_SCHEDULE" "$Env:DOCKER_GC_SCHEDULE" +set_always "DOCKER_GC_PRUNE_UNTIL" "$Env:DOCKER_GC_PRUNE_UNTIL" +set_always "DOCKER_GC_PRUNE_IMAGES" "$Env:DOCKER_GC_PRUNE_IMAGES" +set_always "DOCKER_GC_PRUNE_VOLUMES" "$Env:DOCKER_GC_PRUNE_VOLUMES" set_always "DOCKER_VERSION" "$DOCKER_VERSION" set_always "PLUGINS_ENABLED" "$PLUGINS_ENABLED" set_always "BUILDKITE_ARTIFACTS_BUCKET" "$Env:BUILDKITE_ARTIFACTS_BUCKET" @@ -218,6 +222,14 @@ if ($docker_ready) { Write-Output "Docker is ready." # Optionally run docker ps again to show output if needed, but the check already passed # docker ps + + Write-Output "Configuring Docker garbage collection..." + try { + & "C:\buildkite-agent\bin\bk-configure-docker-gc.ps1" + } catch { + Write-Warning "Failed to configure Docker GC: $($_.Exception.Message)" + Write-Warning "Continuing without Docker GC configuration..." + } } else { Write-Output "Failed to confirm Docker readiness after $max_wait_time seconds." # Add more diagnostics if possible diff --git a/templates/aws-stack.yml b/templates/aws-stack.yml index c4f1861ad..ea49e22f8 100644 --- a/templates/aws-stack.yml +++ b/templates/aws-stack.yml @@ -149,6 +149,12 @@ Metadata: Parameters: - EnableDockerUserNamespaceRemap - EnableDockerExperimental + - DockerGCSchedule + - DockerGCPruneUntil + - DockerGCPruneImages + - DockerGCPruneVolumes + - DiskMinAvailable + - DiskMinInodes - Label: default: Docker Registry Configuration @@ -803,6 +809,48 @@ Parameters: - "false" Default: "false" + DockerGCSchedule: + Description: Systemd timer schedule for docker garbage collection (default is hourly) + Type: String + Default: "hourly" + AllowedPattern: "^(minutely|hourly|daily|weekly|monthly|yearly|\\*-\\*-\\* \\d{1,2}:\\d{2}:\\d{2}|\\d{1,2} \\d{1,2} \\* \\* \\*)$" + ConstraintDescription: Must be a valid systemd timer schedule (minutely, hourly, daily, weekly, monthly, yearly, or cron format like "0 3 * * *") + + DockerGCPruneUntil: + Description: Remove docker resources older than this duration (default is 4h) + Type: String + Default: "4h" + AllowedPattern: "^\\d+[smhdw]$" + ConstraintDescription: Must be a valid duration format (e.g., 30s, 5m, 2h, 1d, 1w) + + DockerGCPruneImages: + Description: Set to "true" to prune all unused images, not just dangling ones + Type: String + AllowedValues: + - "true" + - "false" + Default: "false" + + DockerGCPruneVolumes: + Description: Set to "true" to prune anonymous volumes during garbage collection + Type: String + AllowedValues: + - "true" + - "false" + Default: "false" + + DiskMinAvailable: + Description: Minimum disk space (in KB) before triggering emergency Docker cleanup (default is 5GB) + Type: Number + Default: 5242880 + MinValue: 524288 + + DiskMinInodes: + Description: Minimum free inodes before triggering emergency Docker cleanup + Type: Number + Default: 250000 + MinValue: 10000 + EnableInstanceStorage: Type: String Description: > @@ -1764,6 +1812,10 @@ Resources: $Env:DOCKER_USERNS_REMAP="${EnableDockerUserNamespaceRemap}" $Env:DOCKER_EXPERIMENTAL="${EnableDockerExperimental}" $Env:DOCKER_NETWORKING_PROTOCOL="${DockerNetworkingProtocol}" + $Env:DOCKER_GC_SCHEDULE="${DockerGCSchedule}" + $Env:DOCKER_GC_PRUNE_UNTIL="${DockerGCPruneUntil}" + $Env:DOCKER_GC_PRUNE_IMAGES="${DockerGCPruneImages}" + $Env:DOCKER_GC_PRUNE_VOLUMES="${DockerGCPruneVolumes}" powershell -file C:\buildkite-agent\bin\bk-configure-docker.ps1 >> C:\buildkite-agent\elastic-stack.log $Env:BUILDKITE_STACK_NAME="${AWS::StackName}" @@ -1893,6 +1945,12 @@ Resources: DOCKER_LOGIN_PLUGIN_ENABLED="${EnableDockerLoginPlugin}" \ DOCKER_EXPERIMENTAL="${EnableDockerExperimental}" \ DOCKER_USERNS_REMAP=${EnableDockerUserNamespaceRemap} \ + DOCKER_GC_SCHEDULE="${DockerGCSchedule}" \ + DOCKER_GC_PRUNE_UNTIL="${DockerGCPruneUntil}" \ + DOCKER_GC_PRUNE_IMAGES="${DockerGCPruneImages}" \ + DOCKER_GC_PRUNE_VOLUMES="${DockerGCPruneVolumes}" \ + DISK_MIN_AVAILABLE="${DiskMinAvailable}" \ + DISK_MIN_INODES="${DiskMinInodes}" \ AWS_REGION="${AWS::Region}" \ ENABLE_RESOURCE_LIMITS="${ExperimentalEnableResourceLimits}" \ RESOURCE_LIMITS_MEMORY_HIGH="${ResourceLimitsMemoryHigh}" \ @@ -1904,6 +1962,7 @@ Resources: ENABLE_EC2_LOG_RETENTION_POLICY="${EnableEC2LogRetentionPolicy}" \ EC2_LOG_RETENTION_DAYS="${EC2LogRetentionDays}" \ /usr/local/bin/bk-install-elastic-stack.sh + --==BOUNDARY==-- - LocalSecretsBucket: !If - CreateSecretsBucket