diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index 3a1b31b1cf..72ff9e3e1a 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -10,6 +10,7 @@ export interface RunnerList { repo?: string; org?: string; orphan?: boolean; + runnerId?: string; } export interface RunnerInfo { diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index b927f98696..c4ec328c9b 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -4,6 +4,7 @@ import { CreateFleetInstance, CreateFleetResult, CreateTagsCommand, + DeleteTagsCommand, DefaultTargetCapacityType, DescribeInstancesCommand, DescribeInstancesResult, @@ -17,7 +18,7 @@ import { mockClient } from 'aws-sdk-client-mock'; import 'aws-sdk-client-mock-jest/vitest'; import ScaleError from './../scale-runners/ScaleError'; -import { createRunner, listEC2Runners, tag, terminateRunner } from './runners'; +import { createRunner, listEC2Runners, tag, untag, terminateRunner } from './runners'; import { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -53,6 +54,26 @@ const mockRunningInstances: DescribeInstancesResult = { }, ], }; +const mockRunningInstancesJit: DescribeInstancesResult = { + Reservations: [ + { + Instances: [ + { + LaunchTime: new Date('2020-10-10T14:48:00.000+09:00'), + InstanceId: 'i-1234', + Tags: [ + { Key: 'ghr:Application', Value: 'github-action-runner' }, + { Key: 'ghr:runner_name_prefix', Value: RUNNER_NAME_PREFIX }, + { Key: 'ghr:created_by', Value: 'scale-up-lambda' }, + { Key: 'ghr:Type', Value: 'Org' }, + { Key: 'ghr:Owner', Value: 'CoderToCat' }, + { Key: 'ghr:github_runner_id', Value: '9876543210' }, + ], + }, + ], + }, + ], +}; 
describe('list instances', () => { beforeEach(() => { @@ -60,7 +81,7 @@ describe('list instances', () => { vi.clearAllMocks(); }); - it('returns a list of instances', async () => { + it('returns a list of instances (Non JIT)', async () => { mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances); const resp = await listEC2Runners(); expect(resp.length).toBe(1); @@ -73,6 +94,20 @@ describe('list instances', () => { }); }); + it('returns a list of instances (JIT)', async () => { + mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstancesJit); + const resp = await listEC2Runners(); + expect(resp.length).toBe(1); + expect(resp).toContainEqual({ + instanceId: 'i-1234', + launchTime: new Date('2020-10-10T14:48:00.000+09:00'), + type: 'Org', + owner: 'CoderToCat', + orphan: false, + runnerId: '9876543210', + }); + }); + it('check orphan tag.', async () => { const instances: DescribeInstancesResult = mockRunningInstances; instances.Reservations![0].Instances![0].Tags!.push({ Key: 'ghr:orphan', Value: 'true' }); @@ -229,11 +264,35 @@ describe('tag runner', () => { owner: 'owner-2', type: 'Repo', }; - await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'truer' }]); + await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + + expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], + }); + }); +}); +describe('untag runner', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + it('removing extra tag', async () => { + mockEC2Client.on(DeleteTagsCommand).resolves({}); + const runner: RunnerInfo = { + instanceId: 'instance-2', + owner: 'owner-2', + type: 'Repo', + }; + await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { Resources: [runner.instanceId], - Tags: [{ Key: 'ghr:orphan', Value: 'truer' }], + Tags: [{ Key: 'ghr:orphan', Value: 'true' 
}], + }); + await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + expect(mockEC2Client).toHaveReceivedCommandWith(DeleteTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], }); }); }); diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index dfe4d99fcf..6779dd39d2 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -2,6 +2,7 @@ import { CreateFleetCommand, CreateFleetResult, CreateTagsCommand, + DeleteTagsCommand, DescribeInstancesCommand, DescribeInstancesResult, EC2Client, @@ -91,6 +92,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string, org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string, orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true', + runnerId: i.Tags?.find((e) => e.Key === 'ghr:github_runner_id')?.Value as string, }); } } @@ -112,6 +114,12 @@ export async function tag(instanceId: string, tags: Tag[]): Promise { await ec2.send(new CreateTagsCommand({ Resources: [instanceId], Tags: tags })); } +export async function untag(instanceId: string, tags: Tag[]): Promise { + logger.debug(`Untagging '${instanceId}'`, { tags }); + const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); + await ec2.send(new DeleteTagsCommand({ Resources: [instanceId], Tags: tags })); +} + function generateFleetOverrides( subnetIds: string[], instancesTypes: string[], diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 87bab093cb..8dd25323a6 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -4,7 +4,7 @@ import nock from 'nock'; 
import { RunnerInfo, RunnerList } from '../aws/runners.d'; import * as ghAuth from '../github/auth'; -import { listEC2Runners, terminateRunner, tag } from './../aws/runners'; +import { listEC2Runners, terminateRunner, tag, untag } from './../aws/runners'; import { githubCache } from './cache'; import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -33,6 +33,7 @@ vi.mock('./../aws/runners', async (importOriginal) => { return { ...actual, tag: vi.fn(), + untag: vi.fn(), terminateRunner: vi.fn(), listEC2Runners: vi.fn(), }; @@ -62,6 +63,7 @@ const mockedInstallationAuth = vi.mocked(ghAuth.createGithubInstallationAuth); const mockCreateClient = vi.mocked(ghAuth.createOctokitClient); const mockListRunners = vi.mocked(listEC2Runners); const mockTagRunners = vi.mocked(tag); +const mockUntagRunners = vi.mocked(untag); const mockTerminateRunners = vi.mocked(terminateRunner); export interface TestData { @@ -312,7 +314,7 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); - it(`Should terminate orphan.`, async () => { + it(`Should terminate orphan (Non JIT)`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false); const idleRunner = createRunnerTestData('idle-1', type, MINIMUM_BOOT_TIME + 1, true, false, false); @@ -334,6 +336,7 @@ describe('Scale down runners', () => { Value: 'true', }, ]); + expect(mockTagRunners).not.toHaveBeenCalledWith(idleRunner.instanceId, expect.anything()); // next cycle, update test data set orphan to true and terminate should be true @@ -348,6 +351,58 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); + it('Should test if orphaned runner, untag if online and busy, else terminate (JIT)', async () => { + // arrange + const orphanRunner = createRunnerTestData( + 'orphan-jit', + type, + MINIMUM_BOOT_TIME + 1, + false, + true, + false, + 
undefined, + 1234567890, + ); + const runners = [orphanRunner]; + + mockGitHubRunners([]); + mockAwsRunners(runners); + + if (type === 'Repo') { + mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ + data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + }); + } else { + mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ + data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + }); + } + + // act + await scaleDown(); + + // assert + expect(mockUntagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + expect(mockTerminateRunners).not.toHaveBeenCalledWith(orphanRunner.instanceId); + + // arrange + if (type === 'Repo') { + mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + }); + } else { + mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + }); + } + + // act + await scaleDown(); + + // assert + expect(mockTerminateRunners).toHaveBeenCalledWith(orphanRunner.instanceId); + }); + it(`Should ignore errors when termination orphan fails.`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true); @@ -625,6 +680,7 @@ function createRunnerTestData( orphan: boolean, shouldBeTerminated: boolean, owner?: string, + runnerId?: number, ): RunnerTestItem { return { instanceId: `i-${name}-${type.toLowerCase()}`, @@ -638,5 +694,6 @@ function createRunnerTestData( registered, orphan, shouldBeTerminated, + runnerId: runnerId !== undefined ? 
String(runnerId) : undefined, }; } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 611d4d217c..8f5cbd42d4 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -1,9 +1,10 @@ import { Octokit } from '@octokit/rest'; +import { Endpoints } from '@octokit/types'; import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import moment from 'moment'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners'; +import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; import { RunnerInfo, RunnerList } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; @@ -12,6 +13,10 @@ import { getGitHubEnterpriseApiUrl } from './scale-up'; const logger = createChildLogger('scale-down'); +type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners']; +type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners']; +type RunnerState = OrgRunnerList[number] | RepoRunnerList[number]; + async function getOrCreateOctokit(runner: RunnerInfo): Promise { const key = runner.owner; const cachedOctokit = githubCache.clients.get(key); @@ -46,7 +51,11 @@ async function getOrCreateOctokit(runner: RunnerInfo): Promise { return octokit; } -async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { +async function getGitHubSelfHostedRunnerState( + client: Octokit, + ec2runner: RunnerInfo, + runnerId: number, +): Promise { const state = ec2runner.type === 'Org' ? 
await client.actions.getSelfHostedRunnerForOrg({ @@ -58,12 +67,16 @@ async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, owner: ec2runner.owner.split('/')[0], repo: ec2runner.owner.split('/')[1], }); - - logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.data.busy}`); - metricGitHubAppRateLimit(state.headers); - return state.data.busy; + metricGitHubAppRateLimit(state.headers); + return state.data; +} + +async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise<boolean> { + const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId); + logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.busy}`); + return state.busy; } async function listGitHubRunners(runner: RunnerInfo): Promise<GhRunners> { @@ -194,24 +207,59 @@ async function evaluateAndRemoveRunners( async function markOrphan(instanceId: string): Promise<void> { try { await tag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); - logger.info(`Runner '${instanceId}' marked as orphan.`); + logger.info(`Runner '${instanceId}' tagged as orphan.`); } catch (e) { - logger.error(`Failed to mark runner '${instanceId}' as orphan.`, { error: e }); + logger.error(`Failed to tag runner '${instanceId}' as orphan.`, { error: e }); } } +async function unMarkOrphan(instanceId: string): Promise<void> { + try { + await untag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + logger.info(`Runner '${instanceId}' untagged as orphan.`); + } catch (e) { + logger.error(`Failed to un-tag runner '${instanceId}' as orphan.`, { error: e }); + } +} + +async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise<boolean> { + const client = await getOrCreateOctokit(runner as RunnerInfo); + const runnerId = parseInt(runner.runnerId || '0'); + const ec2Instance = runner as RunnerInfo; + const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); + let isOrphan = false; + logger.debug( + `Runner 
'${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`, + ); + const isOfflineAndBusy = state.status === 'offline' && state.busy; + if (isOfflineAndBusy) { + isOrphan = true; + } + logger.info(`Runner '${runner.instanceId}' is judged to ${isOrphan ? 'be' : 'not be'} orphaned.`); + return isOrphan; +} + async function terminateOrphan(environment: string): Promise { try { const orphanRunners = await listEC2Runners({ environment, orphan: true }); for (const runner of orphanRunners) { - logger.info(`Terminating orphan runner '${runner.instanceId}'`); - await terminateRunner(runner.instanceId).catch((e) => { - logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); - }); + if (runner.runnerId) { + const isOrphan = await lastChanceCheckOrphanRunner(runner); + if (isOrphan) { + await terminateRunner(runner.instanceId); + } else { + await unMarkOrphan(runner.instanceId); + } + } else { + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } } } catch (e) { - logger.warn(`Failure during orphan runner termination.`, { error: e }); + logger.warn(`Failure during orphan termination processing.`, { error: e }); } } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 0611a6e697..14c0a0422e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -41,6 +41,7 @@ vi.mock('@octokit/rest', () => ({ vi.mock('./../aws/runners', async () => ({ createRunner: vi.fn(), listEC2Runners: vi.fn(), + tag: vi.fn(), })); vi.mock('./../github/auth', async () => ({ @@ -645,7 +646,7 @@ describe('scaleUp with public GH', () => { }); }); - it('JIT config is ingored 
for non-ephemeral runners.', async () => { + it('JIT config is ignored for non-ephemeral runners.', async () => { process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; process.env.ENABLE_JIT_CONFIG = 'true'; process.env.ENABLE_JOB_QUEUED_CHECK = 'false'; @@ -1008,11 +1009,13 @@ function defaultOctokitMockImpl() { ]); mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(() => ({ data: { + runner: { id: 9876543210 }, encoded_jit_config: 'TEST_JIT_CONFIG_ORG', }, })); mockOctokit.actions.generateRunnerJitconfigForRepo.mockImplementation(() => ({ data: { + runner: { id: 9876543210 }, encoded_jit_config: 'TEST_JIT_CONFIG_REPO', }, })); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 08d16d682a..638edd3232 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -4,7 +4,7 @@ import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util'; import yn from 'yn'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { createRunner, listEC2Runners } from './../aws/runners'; +import { createRunner, listEC2Runners, tag } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import ScaleError from './ScaleError'; import { publishRetryMessage } from './job-retry'; @@ -416,6 +416,14 @@ async function createRegistrationTokenConfig( } } +async function tagRunnerId(instanceId: string, runnerId: string): Promise { + try { + await tag(instanceId, [{ Key: 'ghr:github_runner_id', Value: runnerId }]); + } catch (e) { + logger.error(`Failed to mark runner '${instanceId}' with ${runnerId}.`, { error: e }); + } +} + async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit) { const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, 
ghClient); const { isDelay, delay } = addDelay(instances); @@ -449,6 +457,9 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins metricGitHubAppRateLimit(runnerConfig.headers); + // tag the EC2 instance with the Github runner id + await tagRunnerId(instance, runnerConfig.data.runner.id.toString()); + // store jit config in ssm parameter store logger.debug('Runner JIT config for ephemeral runner generated.', { instance: instance, diff --git a/mkdocs.yaml b/mkdocs.yaml index d4bb359f42..9b98e84a36 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -46,6 +46,11 @@ markdown_extensions: - admonition - pymdownx.details - pymdownx.superfences + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format nav: - Introduction: index.md diff --git a/modules/runners/README.md b/modules/runners/README.md index 397236881d..34ebb61694 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -18,6 +18,8 @@ The scale up lambda is triggered by events on a SQS queue. Events on this queue The scale down lambda is triggered via a CloudWatch event. The event is triggered by a cron expression defined in the variable `scale_down_schedule_expression` (https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html). For scaling down GitHub does not provide a good API yet, therefore we run the scaling down based on this event every x minutes. Each time the lambda is triggered it tries to remove all runners older than x minutes (configurable) managed in this deployment. In case the runner can be removed from GitHub, which means it is not executing a workflow, the lambda will terminate the EC2 instance. +--8<-- "modules/runners/scale-down-state-diagram.md:mkdocs_scale_down_state_diagram" + ## Lambda Function The Lambda function is written in [TypeScript](https://www.typescriptlang.org/) and requires Node 12.x and yarn. 
Sources are located in [./lambdas/runners]. Two lambda functions share the same sources, there is one entry point for `scaleDown` and another one for `scaleUp`. @@ -85,6 +87,7 @@ yarn run dist | [aws_iam_role.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role_policy.cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.create_tag](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.describe_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.dist_bucket](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.ec2](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | diff --git a/modules/runners/policies-runner.tf b/modules/runners/policies-runner.tf index d1b9190930..d923c143cb 100644 --- a/modules/runners/policies-runner.tf +++ b/modules/runners/policies-runner.tf @@ -57,6 +57,12 @@ resource "aws_iam_role_policy" "describe_tags" { policy = file("${path.module}/policies/instance-describe-tags-policy.json") } +resource "aws_iam_role_policy" "create_tag" { + name = "runner-create-tags" + role = aws_iam_role.runner.name + policy = templatefile("${path.module}/policies/instance-create-tags-policy.json", {}) +} + resource "aws_iam_role_policy_attachment" "managed_policies" { count = length(var.runner_iam_role_managed_policy_arns) role = aws_iam_role.runner.name diff --git a/modules/runners/policies/instance-create-tags-policy.json b/modules/runners/policies/instance-create-tags-policy.json new file 
mode 100644 index 0000000000..9da09fcb70 --- /dev/null +++ b/modules/runners/policies/instance-create-tags-policy.json @@ -0,0 +1,20 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": "ec2:CreateTags", + "Condition": { + "ForAllValues:StringEquals": { + "aws:TagKeys": [ + "ghr:github_runner_id" + ] + }, + "StringEquals": { + "aws:ARN": "$${ec2:SourceInstanceARN}" + } + }, + "Effect": "Allow", + "Resource": "arn:*:ec2:*:*:instance/*" + } + ] +} diff --git a/modules/runners/policies/lambda-scale-down.json b/modules/runners/policies/lambda-scale-down.json index 0f73aeacf1..d35be746b7 100644 --- a/modules/runners/policies/lambda-scale-down.json +++ b/modules/runners/policies/lambda-scale-down.json @@ -15,7 +15,8 @@ "Effect": "Allow", "Action": [ "ec2:TerminateInstances", - "ec2:CreateTags" + "ec2:CreateTags", + "ec2:DeleteTags" ], "Resource": [ "*" @@ -30,7 +31,8 @@ "Effect": "Allow", "Action": [ "ec2:TerminateInstances", - "ec2:CreateTags" + "ec2:CreateTags", + "ec2:DeleteTags" ], "Resource": [ "*" diff --git a/modules/runners/scale-down-state-diagram.md b/modules/runners/scale-down-state-diagram.md new file mode 100644 index 0000000000..b4f260eb2a --- /dev/null +++ b/modules/runners/scale-down-state-diagram.md @@ -0,0 +1,150 @@ +# GitHub Actions Runner Scale-Down State Diagram + + + +The scale-down Lambda function runs on a scheduled basis (every 5 minutes by default) to manage GitHub Actions runner instances. It performs a two-phase cleanup process: first terminating confirmed orphaned instances, then evaluating active runners to maintain the desired idle capacity while removing unnecessary instances. 
+ +```mermaid +stateDiagram-v2 + [*] --> ScheduledExecution : Cron Trigger every 5 min + + ScheduledExecution --> Phase1_OrphanTermination : Start Phase 1 + + state Phase1_OrphanTermination { + [*] --> ListOrphanInstances : Query EC2 for ghr orphan true + + ListOrphanInstances --> CheckOrphanType : For each orphan + + state CheckOrphanType <<choice>> + CheckOrphanType --> HasRunnerIdTag : Has ghr github runner id + CheckOrphanType --> TerminateOrphan : No runner ID tag + + HasRunnerIdTag --> LastChanceCheck : Query GitHub API + + state LastChanceCheck <<choice>> + LastChanceCheck --> ConfirmedOrphan : Offline and busy + LastChanceCheck --> FalsePositive : Exists and not problematic + + ConfirmedOrphan --> TerminateOrphan + FalsePositive --> RemoveOrphanTag + + TerminateOrphan --> NextOrphan : Continue processing + RemoveOrphanTag --> NextOrphan + + NextOrphan --> CheckOrphanType : More orphans? + NextOrphan --> Phase2_ActiveRunners : All processed + } + + Phase1_OrphanTermination --> Phase2_ActiveRunners : Phase 1 Complete + + state Phase2_ActiveRunners { + [*] --> ListActiveRunners : Query non-orphan EC2 instances + + ListActiveRunners --> GroupByOwner : Sort by owner and repo + + GroupByOwner --> ProcessOwnerGroup : For each owner + + state ProcessOwnerGroup { + [*] --> SortByStrategy : Apply eviction strategy + SortByStrategy --> ProcessRunner : Oldest first or newest first + + ProcessRunner --> QueryGitHub : Get GitHub runners for owner + + QueryGitHub --> MatchRunner : Find runner by instance ID suffix + + state MatchRunner <<choice>> + MatchRunner --> FoundInGitHub : Runner exists in GitHub + MatchRunner --> NotFoundInGitHub : Runner not in GitHub + + state FoundInGitHub { + [*] --> CheckMinimumTime : Has minimum runtime passed? 
+ + state CheckMinimumTime <<choice>> + CheckMinimumTime --> TooYoung : Runtime less than minimum + CheckMinimumTime --> CheckIdleQuota : Runtime greater than or equal to minimum + + TooYoung --> NextRunner + + state CheckIdleQuota <<choice>> + CheckIdleQuota --> KeepIdle : Idle quota available + CheckIdleQuota --> CheckBusyState : Quota full + + KeepIdle --> NextRunner + + state CheckBusyState <<choice>> + CheckBusyState --> KeepBusy : Runner busy + CheckBusyState --> TerminateIdle : Runner idle + + KeepBusy --> NextRunner + TerminateIdle --> DeregisterFromGitHub + DeregisterFromGitHub --> TerminateInstance + TerminateInstance --> NextRunner + } + + state NotFoundInGitHub { + [*] --> CheckBootTime : Has boot time exceeded? + + state CheckBootTime <<choice>> + CheckBootTime --> StillBooting : Boot time less than threshold + CheckBootTime --> MarkOrphan : Boot time greater than or equal to threshold + + StillBooting --> NextRunner + MarkOrphan --> TagAsOrphan : Set ghr orphan true + TagAsOrphan --> NextRunner + } + + NextRunner --> ProcessRunner : More runners in group? + NextRunner --> NextOwnerGroup : Group complete + } + + NextOwnerGroup --> ProcessOwnerGroup : More owner groups? 
+ NextOwnerGroup --> ExecutionComplete : All groups processed + } + + Phase2_ActiveRunners --> ExecutionComplete : Phase 2 Complete + + ExecutionComplete --> [*] : Wait for next cron trigger + + note right of LastChanceCheck + Uses ghr github runner id tag + for precise GitHub API lookup + end note + + note right of MatchRunner + Matches GitHub runner name + ending with EC2 instance ID + end note + + note right of CheckMinimumTime + Minimum running time in minutes + (Linux: 5min, Windows: 15min) + end note + + note right of CheckBootTime + Runner boot time in minutes + Default configuration value + end note +``` + + + +## Key Decision Points + +| State | Condition | Action | +|-------|-----------|--------| +| **Orphan w/ Runner ID** | GitHub: offline + busy | Terminate (confirmed orphan) | +| **Orphan w/ Runner ID** | GitHub: exists + healthy | Remove orphan tag (false positive) | +| **Orphan w/o Runner ID** | Always | Terminate (no way to verify) | +| **Active Runner Found** | Runtime < minimum | Keep (too young) | +| **Active Runner Found** | Idle quota available | Keep as idle | +| **Active Runner Found** | Quota full + idle | Terminate + deregister | +| **Active Runner Found** | Quota full + busy | Keep running | +| **Active Runner Missing** | Boot time exceeded | Mark as orphan | +| **Active Runner Missing** | Still booting | Wait | + +## Configuration Parameters + +- **Cron Schedule**: `cron(*/5 * * * ? 
*)` (every 5 minutes) +- **Minimum Runtime**: Linux 5min, Windows 15min +- **Boot Timeout**: Configurable via `runner_boot_time_in_minutes` +- **Idle Config**: Per-environment configuration for desired idle runners diff --git a/modules/runners/templates/start-runner.ps1 b/modules/runners/templates/start-runner.ps1 index 1ced28dcba..ae2eeff3c9 100644 --- a/modules/runners/templates/start-runner.ps1 +++ b/modules/runners/templates/start-runner.ps1 @@ -1,6 +1,44 @@ ## Retrieve instance metadata +function Tag-InstanceWithRunnerId { + Write-Host "Checking for .runner file to extract agent ID" + + $runnerFilePath = "$pwd\.runner" + if (-not (Test-Path $runnerFilePath)) { + Write-Host "Warning: .runner file not found" + return $true + } + + Write-Host "Found .runner file, extracting agent ID" + try { + $runnerConfig = Get-Content $runnerFilePath | ConvertFrom-Json + $agentId = $runnerConfig.agentId + + if (-not $agentId -or $agentId -eq $null) { + Write-Host "Warning: Could not extract agent ID from .runner file" + return $true + } + + Write-Host "Tagging instance with GitHub runner agent ID: $agentId" + $tagResult = aws ec2 create-tags --region "$Region" --resources "$InstanceId" --tags "Key=ghr:github_runner_id,Value=$agentId" 2>&1 + + if ($LASTEXITCODE -eq 0) { + Write-Host "Successfully tagged instance with agent ID: $agentId" + return $true + } else { + Write-Host "Warning: Failed to tag instance with agent ID - $tagResult" + return $true + } + } + catch { + Write-Host "Warning: Error processing .runner file - $($_.Exception.Message)" + return $true + } +} + +## Retrieve instance metadata + Write-Host "Retrieving TOKEN from AWS API" $token=Invoke-RestMethod -Method PUT -Uri "http://169.254.169.254/latest/api/token" -Headers @{"X-aws-ec2-metadata-token-ttl-seconds" = "180"} if ( ! 
$token ) { @@ -122,6 +160,9 @@ if ($enable_jit_config -eq "false" -or $agent_mode -ne "ephemeral") { $configCmd = ".\config.cmd --unattended --name $runner_name_prefix$InstanceId --work `"_work`" $runnerExtraOptions $config" Write-Host "Configure GH Runner (non ephmeral / no JIT) as user $run_as" Invoke-Expression $configCmd + + # Tag instance with GitHub runner agent ID for non-JIT runners + Tag-InstanceWithRunnerId } $jsonBody = @( diff --git a/modules/runners/templates/start-runner.sh b/modules/runners/templates/start-runner.sh index 1c1f3d5e9f..7f2c0f82c5 100644 --- a/modules/runners/templates/start-runner.sh +++ b/modules/runners/templates/start-runner.sh @@ -58,6 +58,36 @@ create_xray_error_segment() { echo "$SEGMENT_DOC" } +tag_instance_with_runner_id() { + echo "Checking for .runner file to extract agent ID" + + if [[ ! -f "/opt/actions-runner/.runner" ]]; then + echo "Warning: .runner file not found" + return 0 + fi + + echo "Found .runner file, extracting agent ID" + local agent_id + agent_id=$(jq -r '.agentId' /opt/actions-runner/.runner 2>/dev/null || echo "") + + if [[ -z "$agent_id" || "$agent_id" == "null" ]]; then + echo "Warning: Could not extract agent ID from .runner file" + return 0 + fi + + echo "Tagging instance with GitHub runner agent ID: $agent_id" + if aws ec2 create-tags \ + --region "$region" \ + --resources "$instance_id" \ + --tags Key=ghr:github_runner_id,Value="$agent_id"; then + echo "Successfully tagged instance with agent ID: $agent_id" + return 0 + else + echo "Warning: Failed to tag instance with agent ID" + return 0 + fi +} + cleanup() { local exit_code="$1" local error_location="$2" @@ -225,6 +255,9 @@ if [[ "$enable_jit_config" == "false" || $agent_mode != "ephemeral" ]]; then extra_flags="" fi sudo --preserve-env=RUNNER_ALLOW_RUNASROOT -u "$run_as" -- ./config.sh $${extra_flags} --unattended --name "$runner_name_prefix$instance_id" --work "_work" $${config} + + # Tag instance with GitHub runner agent ID for non-JIT runners + 
tag_instance_with_runner_id fi create_xray_success_segment "$SEGMENT"