diff --git a/mkdocs.yaml b/mkdocs.yaml index d4bb359f42..9b98e84a36 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -46,6 +46,11 @@ markdown_extensions: - admonition - pymdownx.details - - pymdownx.superfences + # Custom fence: code blocks named mermaid are emitted with the mermaid class + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format nav: - Introduction: index.md diff --git a/modules/runners/README.md b/modules/runners/README.md index 397236881d..34ebb61694 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -18,6 +18,8 @@ The scale up lambda is triggered by events on a SQS queue. Events on this queue The scale down lambda is triggered via a CloudWatch event. The event is triggered by a cron expression defined in the variable `scale_down_schedule_expression` (https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html). For scaling down GitHub does not provide a good API yet, therefore we run the scaling down based on this event every x minutes. Each time the lambda is triggered it tries to remove all runners older than x minutes (configurable) managed in this deployment. In case the runner can be removed from GitHub, which means it is not executing a workflow, the lambda will terminate the EC2 instance. +--8<-- "modules/runners/scale-down-state-diagram.md:mkdocs_scale_down_state_diagram" + ## Lambda Function The Lambda function is written in [TypeScript](https://www.typescriptlang.org/) and requires Node 12.x and yarn. Sources are located in [./lambdas/runners]. Two lambda functions share the same sources, there is one entry point for `scaleDown` and another one for `scaleUp`. 
@@ -85,6 +87,7 @@ yarn run dist | [aws_iam_role.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role_policy.cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.create_tag](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.describe_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.dist_bucket](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.ec2](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | diff --git a/modules/runners/policies-runner.tf b/modules/runners/policies-runner.tf index d1b9190930..d923c143cb 100644 --- a/modules/runners/policies-runner.tf +++ b/modules/runners/policies-runner.tf @@ -57,6 +57,12 @@ resource "aws_iam_role_policy" "describe_tags" { policy = file("${path.module}/policies/instance-describe-tags-policy.json") } +resource "aws_iam_role_policy" "create_tag" { + name = "runner-create-tags" + role = aws_iam_role.runner.name + policy = templatefile("${path.module}/policies/instance-create-tags-policy.json", {}) +} + resource "aws_iam_role_policy_attachment" "managed_policies" { count = length(var.runner_iam_role_managed_policy_arns) role = aws_iam_role.runner.name diff --git a/modules/runners/policies/instance-create-tags-policy.json b/modules/runners/policies/instance-create-tags-policy.json new file mode 100644 index 0000000000..9da09fcb70 --- /dev/null +++ b/modules/runners/policies/instance-create-tags-policy.json @@ -0,0 +1,20 @@ +{ + "Version": 
"2012-10-17", + "Statement": [ + { + "Action": "ec2:CreateTags", + "Condition": { + "ForAllValues:StringEquals": { + "aws:TagKeys": [ + "ghr:github_runner_id" + ] + }, + "StringEquals": { + "aws:ARN": "$${ec2:SourceInstanceARN}" + } + }, + "Effect": "Allow", + "Resource": "arn:*:ec2:*:*:instance/*" + } + ] +} diff --git a/modules/runners/scale-down-state-diagram.md b/modules/runners/scale-down-state-diagram.md new file mode 100644 index 0000000000..b4f260eb2a --- /dev/null +++ b/modules/runners/scale-down-state-diagram.md @@ -0,0 +1,150 @@ +# GitHub Actions Runner Scale-Down State Diagram + +<!-- --8<-- [start:mkdocs_scale_down_state_diagram] --> + +The scale-down Lambda function runs on a scheduled basis (every 5 minutes by default) to manage GitHub Actions runner instances. It performs a two-phase cleanup process: first terminating confirmed orphaned instances, then evaluating active runners to maintain the desired idle capacity while removing unnecessary instances. + +```mermaid +stateDiagram-v2 + [*] --> ScheduledExecution : Cron Trigger every 5 min + + ScheduledExecution --> Phase1_OrphanTermination : Start Phase 1 + + state Phase1_OrphanTermination { + [*] --> ListOrphanInstances : Query EC2 for ghr orphan true + + ListOrphanInstances --> CheckOrphanType : For each orphan + + state CheckOrphanType <<choice>> + CheckOrphanType --> HasRunnerIdTag : Has ghr github runner id + CheckOrphanType --> TerminateOrphan : No runner ID tag + + HasRunnerIdTag --> LastChanceCheck : Query GitHub API + + state LastChanceCheck <<choice>> + LastChanceCheck --> ConfirmedOrphan : Offline and busy + LastChanceCheck --> FalsePositive : Exists and not problematic + + ConfirmedOrphan --> TerminateOrphan + FalsePositive --> RemoveOrphanTag + + TerminateOrphan --> NextOrphan : Continue processing + RemoveOrphanTag --> NextOrphan + + NextOrphan --> CheckOrphanType : More orphans? 
+ NextOrphan --> Phase2_ActiveRunners : All processed + } + + Phase1_OrphanTermination --> Phase2_ActiveRunners : Phase 1 Complete + + state Phase2_ActiveRunners { + [*] --> ListActiveRunners : Query non-orphan EC2 instances + + ListActiveRunners --> GroupByOwner : Sort by owner and repo + + GroupByOwner --> ProcessOwnerGroup : For each owner + + state ProcessOwnerGroup { + [*] --> SortByStrategy : Apply eviction strategy + SortByStrategy --> ProcessRunner : Oldest first or newest first + + ProcessRunner --> QueryGitHub : Get GitHub runners for owner + + QueryGitHub --> MatchRunner : Find runner by instance ID suffix + + state MatchRunner <<choice>> + MatchRunner --> FoundInGitHub : Runner exists in GitHub + MatchRunner --> NotFoundInGitHub : Runner not in GitHub + + state FoundInGitHub { + [*] --> CheckMinimumTime : Has minimum runtime passed? + + state CheckMinimumTime <<choice>> + CheckMinimumTime --> TooYoung : Runtime less than minimum + CheckMinimumTime --> CheckIdleQuota : Runtime greater than or equal to minimum + + TooYoung --> NextRunner + + state CheckIdleQuota <<choice>> + CheckIdleQuota --> KeepIdle : Idle quota available + CheckIdleQuota --> CheckBusyState : Quota full + + KeepIdle --> NextRunner + + state CheckBusyState <<choice>> + CheckBusyState --> KeepBusy : Runner busy + CheckBusyState --> TerminateIdle : Runner idle + + KeepBusy --> NextRunner + TerminateIdle --> DeregisterFromGitHub + DeregisterFromGitHub --> TerminateInstance + TerminateInstance --> NextRunner + } + + state NotFoundInGitHub { + [*] --> CheckBootTime : Has boot time exceeded? + + state CheckBootTime <<choice>> + CheckBootTime --> StillBooting : Boot time less than threshold + CheckBootTime --> MarkOrphan : Boot time greater than or equal to threshold + + StillBooting --> NextRunner + MarkOrphan --> TagAsOrphan : Set ghr orphan true + TagAsOrphan --> NextRunner + } + + NextRunner --> ProcessRunner : More runners in group? 
+ NextRunner --> NextOwnerGroup : Group complete + } + + NextOwnerGroup --> ProcessOwnerGroup : More owner groups? + NextOwnerGroup --> ExecutionComplete : All groups processed + } + + Phase2_ActiveRunners --> ExecutionComplete : Phase 2 Complete + + ExecutionComplete --> [*] : Wait for next cron trigger + + note right of LastChanceCheck + Uses ghr github runner id tag + for precise GitHub API lookup + end note + + note right of MatchRunner + Matches GitHub runner name + ending with EC2 instance ID + end note + + note right of CheckMinimumTime + Minimum running time in minutes + (Linux: 5min, Windows: 15min) + end note + + note right of CheckBootTime + Runner boot time in minutes + Default configuration value + end note +``` + +<!-- --8<-- [end:mkdocs_scale_down_state_diagram] --> + +## Key Decision Points + +| State | Condition | Action | +|-------|-----------|--------| +| **Orphan w/ Runner ID** | GitHub: offline + busy | Terminate (confirmed orphan) | +| **Orphan w/ Runner ID** | GitHub: exists + healthy | Remove orphan tag (false positive) | +| **Orphan w/o Runner ID** | Always | Terminate (no way to verify) | +| **Active Runner Found** | Runtime < minimum | Keep (too young) | +| **Active Runner Found** | Idle quota available | Keep as idle | +| **Active Runner Found** | Quota full + idle | Deregister + terminate | +| **Active Runner Found** | Quota full + busy | Keep running | +| **Active Runner Missing** | Boot time exceeded | Mark as orphan | +| **Active Runner Missing** | Still booting | Wait | + +## Configuration Parameters + +- **Cron Schedule**: `cron(*/5 * * * ? 
*)` (every 5 minutes) +- **Minimum Runtime**: Linux 5min, Windows 15min +- **Boot Timeout**: Configurable via `runner_boot_time_in_minutes` +- **Idle Config**: Per-environment configuration for desired idle runners diff --git a/modules/runners/templates/start-runner.ps1 b/modules/runners/templates/start-runner.ps1 index 1ced28dcba..ae2eeff3c9 100644 --- a/modules/runners/templates/start-runner.ps1 +++ b/modules/runners/templates/start-runner.ps1 @@ -1,6 +1,44 @@ ## Retrieve instance metadata +function Tag-InstanceWithRunnerId { + # Best-effort: tag this EC2 instance with the agentId read from the .runner file in the current directory. + # Every path returns $true, so a tagging problem is only logged and never reported as a failure. + Write-Host "Checking for .runner file to extract agent ID" + $runnerFilePath = "$pwd\.runner" + if (-not (Test-Path $runnerFilePath)) { + Write-Host "Warning: .runner file not found" + return $true + } + + Write-Host "Found .runner file, extracting agent ID" + try { + # -Raw passes the file as one JSON string; -ErrorAction Stop makes a failed read reach the catch block. + $runnerConfig = Get-Content -Path $runnerFilePath -Raw -ErrorAction Stop | ConvertFrom-Json + $agentId = $runnerConfig.agentId + if (-not $agentId) { + Write-Host "Warning: Could not extract agent ID from .runner file" + return $true + } + + Write-Host "Tagging instance with GitHub runner agent ID: $agentId" + $tagResult = aws ec2 create-tags --region "$Region" --resources "$InstanceId" --tags "Key=ghr:github_runner_id,Value=$agentId" 2>&1 + if ($LASTEXITCODE -eq 0) { + Write-Host "Successfully tagged instance with agent ID: $agentId" + return $true + } else { + Write-Host "Warning: Failed to tag instance with agent ID - $tagResult" + return $true + } + } + catch { + Write-Host "Warning: Error processing .runner file - $($_.Exception.Message)" + return $true + } +} + +## Retrieve instance metadata + Write-Host "Retrieving TOKEN from AWS API" $token=Invoke-RestMethod -Method PUT -Uri "http://169.254.169.254/latest/api/token" -Headers @{"X-aws-ec2-metadata-token-ttl-seconds" = "180"} if ( ! 
$token ) { @@ -122,6 +160,9 @@ if ($enable_jit_config -eq "false" -or $agent_mode -ne "ephemeral") { $configCmd = ".\config.cmd --unattended --name $runner_name_prefix$InstanceId --work `"_work`" $runnerExtraOptions $config" Write-Host "Configure GH Runner (non ephmeral / no JIT) as user $run_as" Invoke-Expression $configCmd + + # Tag instance with GitHub runner agent ID for non-JIT runners + Tag-InstanceWithRunnerId } $jsonBody = @( diff --git a/modules/runners/templates/start-runner.sh b/modules/runners/templates/start-runner.sh index 1c1f3d5e9f..7f2c0f82c5 100644 --- a/modules/runners/templates/start-runner.sh +++ b/modules/runners/templates/start-runner.sh @@ -58,6 +58,36 @@ create_xray_error_segment() { echo "$SEGMENT_DOC" } +tag_instance_with_runner_id() { + # Best-effort tagging of this instance with the agent ID stored in .runner; + # every path returns 0 so the caller never sees a failure from this step. + local runner_file="/opt/actions-runner/.runner" agent_id + + echo "Checking for .runner file to extract agent ID" + if [[ ! -f "$runner_file" ]]; then + echo "Warning: .runner file not found" + return 0 + fi + + echo "Found .runner file, extracting agent ID" + agent_id=$(jq -r '.agentId' "$runner_file" 2>/dev/null || echo "") + case "$agent_id" in + "" | null) + echo "Warning: Could not extract agent ID from .runner file" + return 0 + ;; + esac + + echo "Tagging instance with GitHub runner agent ID: $agent_id" + if ! aws ec2 create-tags --region "$region" --resources "$instance_id" \ + --tags Key=ghr:github_runner_id,Value="$agent_id"; then + echo "Warning: Failed to tag instance with agent ID" + return 0 + fi + echo "Successfully tagged instance with agent ID: $agent_id" + return 0 +} + cleanup() { local exit_code="$1" local error_location="$2" @@ -225,6 +255,9 @@ if [[ "$enable_jit_config" == "false" || $agent_mode != "ephemeral" ]]; then extra_flags="" fi sudo --preserve-env=RUNNER_ALLOW_RUNASROOT -u "$run_as" -- ./config.sh $${extra_flags} --unattended --name "$runner_name_prefix$instance_id" --work "_work" $${config} + + # Tag instance with GitHub runner agent ID for non-JIT runners + 
tag_instance_with_runner_id fi create_xray_success_segment "$SEGMENT"