From 3fa09d24ddef8ab03feb3f2386b17a165eafc75d Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 21 May 2025 18:39:55 +0200 Subject: [PATCH 01/38] feat(runners): add runnerId to RunnerList and implement untag functionality for incorrectly tagged runners --- .../control-plane/src/aws/runners.d.ts | 8 +++ .../control-plane/src/aws/runners.test.ts | 67 +++++++++++++++++-- .../control-plane/src/aws/runners.ts | 8 +++ .../src/scale-runners/scale-down.test.ts | 54 ++++++++++++++- .../src/scale-runners/scale-down.ts | 52 ++++++++++---- .../src/scale-runners/scale-up.test.ts | 5 +- .../src/scale-runners/scale-up.ts | 18 ++++- 7 files changed, 192 insertions(+), 20 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index 3a1b31b1cf..752ff07edd 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -10,6 +10,14 @@ export interface RunnerList { repo?: string; org?: string; orphan?: boolean; + runnerId?: string; +} + +export interface RunnerState { + id: number; + name: string; + busy: boolean; + status: string; } export interface RunnerInfo { diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index b927f98696..008f29433f 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -4,6 +4,7 @@ import { CreateFleetInstance, CreateFleetResult, CreateTagsCommand, + DeleteTagsCommand, DefaultTargetCapacityType, DescribeInstancesCommand, DescribeInstancesResult, @@ -17,7 +18,7 @@ import { mockClient } from 'aws-sdk-client-mock'; import 'aws-sdk-client-mock-jest/vitest'; import ScaleError from './../scale-runners/ScaleError'; -import { createRunner, listEC2Runners, tag, terminateRunner } from './runners'; +import { createRunner, listEC2Runners, tag, untag, terminateRunner } from './runners'; import { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -53,6 +54,26 @@ const mockRunningInstances: DescribeInstancesResult = { }, ], }; +const mockRunningInstancesJit: DescribeInstancesResult = { + Reservations: [ + { + Instances: [ + { + LaunchTime: new Date('2020-10-10T14:48:00.000+09:00'), + InstanceId: 'i-1234', + Tags: [ + { Key: 'ghr:Application', Value: 'github-action-runner' }, + { Key: 'ghr:runner_name_prefix', Value: RUNNER_NAME_PREFIX }, + { Key: 'ghr:created_by', Value: 'scale-up-lambda' }, + { Key: 'ghr:Type', Value: 'Org' }, + { Key: 'ghr:Owner', Value: 'CoderToCat' }, + { Key: 'ghr:githubrunnerid', Value: '9876543210' }, + ], + }, + ], + }, + ], +}; describe('list instances', () => { beforeEach(() => { @@ -60,7 +81,7 @@ describe('list instances', () => { vi.clearAllMocks(); }); - it('returns a list of instances', async () => { + it('returns a list of instances (Non JIT)', async () => { mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances); const resp = await listEC2Runners(); expect(resp.length).toBe(1); @@ -73,6 +94,20 @@ describe('list instances', () => { }); }); + it('returns a list of instances (JIT)', async () => { + mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstancesJit); + const resp = await listEC2Runners(); + expect(resp.length).toBe(1); + expect(resp).toContainEqual({ + instanceId: 'i-1234', + launchTime: new Date('2020-10-10T14:48:00.000+09:00'), + type: 'Org', + owner: 'CoderToCat', + orphan: false, + runnerId: '9876543210', + }); + }); + it('check orphan tag.', async () => { const instances: DescribeInstancesResult = mockRunningInstances; instances.Reservations![0].Instances![0].Tags!.push({ Key: 'ghr:orphan', Value: 'true' }); @@ -229,11 +264,35 @@ describe('tag runner', () => { owner: 'owner-2', type: 'Repo', }; - await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'truer' }]); + await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + + expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], + }); + }); +}); +describe('untag runner', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + it('removing extra tag', async () => { + mockEC2Client.on(DeleteTagsCommand).resolves({}); + const runner: RunnerInfo = { + instanceId: 'instance-2', + owner: 'owner-2', + type: 'Repo', + }; + //await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: '' }]); expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { Resources: [runner.instanceId], - Tags: [{ Key: 'ghr:orphan', Value: 'truer' }], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], + }); + await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + expect(mockEC2Client).toHaveReceivedCommandWith(DeleteTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], }); }); }); diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index dfe4d99fcf..c8fb1465f7 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -2,6 +2,7 @@ import { CreateFleetCommand, CreateFleetResult, CreateTagsCommand, + DeleteTagsCommand, DescribeInstancesCommand, DescribeInstancesResult, EC2Client, @@ -91,6 +92,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string, org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string, orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true', + runnerId: i.Tags?.find((e) => e.Key === 'ghr:githubrunnerid')?.Value as string, }); } } @@ -112,6 +114,12 @@ export async function tag(instanceId: string, tags: Tag[]): Promise { await ec2.send(new CreateTagsCommand({ Resources: [instanceId], Tags: tags })); } +export async function untag(instanceId: string, tags: Tag[]): Promise { + logger.debug(`Untagging '${instanceId}'`, { tags }); + const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); + await ec2.send(new DeleteTagsCommand({ Resources: [instanceId], Tags: tags })); +} + function generateFleetOverrides( subnetIds: string[], instancesTypes: string[], diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 87bab093cb..7823d0ec76 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -4,7 +4,7 @@ import nock from 'nock'; import { RunnerInfo, RunnerList } from '../aws/runners.d'; import * as ghAuth from '../github/auth'; -import { listEC2Runners, terminateRunner, tag } from './../aws/runners'; +import { listEC2Runners, terminateRunner, tag, untag } from './../aws/runners'; import { githubCache } from './cache'; import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -33,6 +33,7 @@ vi.mock('./../aws/runners', async (importOriginal) => { return { ...actual, tag: vi.fn(), + untag: vi.fn(), terminateRunner: vi.fn(), listEC2Runners: vi.fn(), }; @@ -62,6 +63,7 @@ const mockedInstallationAuth = vi.mocked(ghAuth.createGithubInstallationAuth); const mockCreateClient = vi.mocked(ghAuth.createOctokitClient); const mockListRunners = vi.mocked(listEC2Runners); const mockTagRunners = vi.mocked(tag); +const mockUntagRunners = vi.mocked(untag); const mockTerminateRunners = vi.mocked(terminateRunner); export interface TestData { @@ -312,7 +314,7 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); - it(`Should terminate orphan.`, async () => { + it(`Should terminate orphan (Non JIT)`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false); const idleRunner = createRunnerTestData('idle-1', type, MINIMUM_BOOT_TIME + 1, true, false, false); @@ -334,6 +336,7 @@ describe('Scale down runners', () => { Value: 'true', }, ]); + expect(mockTagRunners).not.toHaveBeenCalledWith(idleRunner.instanceId, expect.anything()); // next cycle, update test data set orphan to true and terminate should be true @@ -348,6 +351,51 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); + it('Should test if orphaned runner untag if online and busy, else terminate (JIT)', async () => { + // arrange + const orphanRunner = createRunnerTestData('orphan-jit', type, MINIMUM_BOOT_TIME + 1, false, true, false, undefined, 1234567890); + const runners = [orphanRunner]; + + mockGitHubRunners([]); + mockAwsRunners(runners); + + if (type === 'Repo') { + mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + }); + } else { + mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + }); + } + + // act + await scaleDown(); + + // assert + expect(mockUntagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [ + { Key: 'ghr:orphan', Value: 'true' }, + ]); + expect(mockTerminateRunners).not.toHaveBeenCalledWith(orphanRunner.instanceId); + + // arrange + if (type === 'Repo') { + mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + }); + } else { + mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + }); + } + + // act + await scaleDown(); + + // assert + expect(mockTerminateRunners).toHaveBeenCalledWith(orphanRunner.instanceId); + }); + it(`Should ignore errors when termination orphan fails.`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true); @@ -625,6 +673,7 @@ function createRunnerTestData( orphan: boolean, shouldBeTerminated: boolean, owner?: string, + runnerId?: number, ): RunnerTestItem { return { instanceId: `i-${name}-${type.toLowerCase()}`, @@ -638,5 +687,6 @@ function createRunnerTestData( registered, orphan, shouldBeTerminated, + runnerId: runnerId !== undefined ? String(runnerId) : undefined, }; } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 10d5615b43..71820d9d5e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -3,8 +3,8 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import moment from 'moment'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners'; -import { RunnerInfo, RunnerList } from './../aws/runners.d'; +import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; +import { RunnerInfo, RunnerList, RunnerState } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; @@ -46,7 +46,11 @@ async function getOrCreateOctokit(runner: RunnerInfo): Promise { return octokit; } -async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { +async function getGitHubSelfHostedRunnerState( + client: Octokit, + ec2runner: RunnerInfo, + runnerId: number, +): Promise { const state = ec2runner.type === 'Org' ? await client.actions.getSelfHostedRunnerForOrg({ @@ -59,11 +63,12 @@ async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, repo: ec2runner.owner.split('/')[1], }); - logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.data.busy}`); - - metricGitHubAppRateLimit(state.headers); + return { id: state.data.id, name: state.data.name, busy: state.data.busy, status: state.data.status }; +} - return state.data.busy; +async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { + const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId); + return state.busy; } async function listGitHubRunners(runner: RunnerInfo): Promise { @@ -205,13 +210,36 @@ async function terminateOrphan(environment: string): Promise { const orphanRunners = await listEC2Runners({ environment, orphan: true }); for (const runner of orphanRunners) { - logger.info(`Terminating orphan runner '${runner.instanceId}'`); - await terminateRunner(runner.instanceId).catch((e) => { - logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); - }); + // do we have a valid runnerId? then we are in a Jit Runner scenario else, use old method + logger.info(`Runner var us '${JSON.stringify(runner)}' `); + if (runner.runnerId) { + logger.info(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); + // build a runner instance + const client = await getOrCreateOctokit(runner as RunnerInfo); + const runnerId = parseInt(runner.runnerId); + const ec2Instance = runner as RunnerInfo; + const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); + logger.info(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); + if (state.status === 'online' && state.busy) { + logger.info(`Runner '${runner.instanceId}' is orphan, but is online and busy.`); + await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + } else if (state.status === 'offline') { + logger.warn(`Runner '${runner.instanceId}' is orphan.`); + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } + } else { + logger.warn(`Runner '${runner.instanceId}' is orphan, but no runnerId found.`); + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } } } catch (e) { - logger.warn(`Failure during orphan runner termination.`, { error: e }); + logger.warn(`Failure during orphan termination processing.`, { error: e }); } } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 0611a6e697..14c0a0422e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -41,6 +41,7 @@ vi.mock('@octokit/rest', () => ({ vi.mock('./../aws/runners', async () => ({ createRunner: vi.fn(), listEC2Runners: vi.fn(), + tag: vi.fn(), })); vi.mock('./../github/auth', async () => ({ @@ -645,7 +646,7 @@ describe('scaleUp with public GH', () => { }); }); - it('JIT config is ingored for non-ephemeral runners.', async () => { + it('JIT config is ignored for non-ephemeral runners.', async () => { process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; process.env.ENABLE_JIT_CONFIG = 'true'; process.env.ENABLE_JOB_QUEUED_CHECK = 'false'; @@ -1008,11 +1009,13 @@ function defaultOctokitMockImpl() { ]); mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(() => ({ data: { + runner: { id: 9876543210 }, encoded_jit_config: 'TEST_JIT_CONFIG_ORG', }, })); mockOctokit.actions.generateRunnerJitconfigForRepo.mockImplementation(() => ({ data: { + runner: { id: 9876543210 }, encoded_jit_config: 'TEST_JIT_CONFIG_REPO', }, })); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 08d16d682a..faf9e4aece 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -4,11 +4,12 @@ import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util'; import yn from 'yn'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { createRunner, listEC2Runners } from './../aws/runners'; +import { createRunner, listEC2Runners, tag } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import ScaleError from './ScaleError'; import { publishRetryMessage } from './job-retry'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; +import { run } from '../local-down'; const logger = createChildLogger('scale-up'); @@ -416,6 +417,15 @@ async function createRegistrationTokenConfig( } } +async function addGhRunnerIdToEC2InstanceTag(instanceId: string, runnerId: string): Promise { + try { + await tag(instanceId, [{ Key: 'ghr:githubrunnerid', Value: runnerId }]); + logger.info(`Runner '${instanceId}' marked with ${runnerId}.`); + } catch (e) { + logger.error(`Failed to mark runner '${instanceId}' with ${runnerId}.`, { error: e }); + } +} + async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit) { const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, ghClient); const { isDelay, delay } = addDelay(instances); @@ -449,6 +459,12 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins metricGitHubAppRateLimit(runnerConfig.headers); + // tag the EC2 instance with the Github runner id + logger.debug('Tagging instance with GitHub runner ID', { + runnerId: runnerConfig.data.runner.id, + }); + await addGhRunnerIdToEC2InstanceTag(instance, runnerConfig.data.runner.id.toString()); + // store jit config in ssm parameter store logger.debug('Runner JIT config for ephemeral runner generated.', { instance: instance, From 14df9fe6c22371a321f5888551d44e04922888ad Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Thu, 22 May 2025 09:22:04 +0200 Subject: [PATCH 02/38] fix(tests): improve clarity of orphaned runner untagging test description --- .../control-plane/src/scale-runners/scale-down.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 7823d0ec76..212a4ebe0d 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -351,7 +351,7 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); - it('Should test if orphaned runner untag if online and busy, else terminate (JIT)', async () => { + it('Should test if orphaned runner, untag if online and busy, else terminate (JIT)', async () => { // arrange const orphanRunner = createRunnerTestData('orphan-jit', type, MINIMUM_BOOT_TIME + 1, false, true, false, undefined, 1234567890); const runners = [orphanRunner]; From 9d7c89a339793f0ee2353e2033a5d03a6181d04a Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Thu, 22 May 2025 10:40:07 +0200 Subject: [PATCH 03/38] fmt --- .../src/scale-runners/scale-down.test.ts | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 212a4ebe0d..318808b95c 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -351,9 +351,18 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); - it('Should test if orphaned runner, untag if online and busy, else terminate (JIT)', async () => { + it('Should test if orphaned runner, untag if online and busy, else terminate (JIT)', async () => { // arrange - const orphanRunner = createRunnerTestData('orphan-jit', type, MINIMUM_BOOT_TIME + 1, false, true, false, undefined, 1234567890); + const orphanRunner = createRunnerTestData( + 'orphan-jit', + type, + MINIMUM_BOOT_TIME + 1, + false, + true, + false, + undefined, + 1234567890, + ); const runners = [orphanRunner]; mockGitHubRunners([]); @@ -373,9 +382,7 @@ describe('Scale down runners', () => { await scaleDown(); // assert - expect(mockUntagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [ - { Key: 'ghr:orphan', Value: 'true' }, - ]); + expect(mockUntagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); expect(mockTerminateRunners).not.toHaveBeenCalledWith(orphanRunner.instanceId); // arrange From e09337f34888361e7720c0465c176ede63ad3c78 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Thu, 22 May 2025 10:42:55 +0200 Subject: [PATCH 04/38] fix(scale-down): remove unnecessary logging of runner variable in terminateOrphan function --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 71820d9d5e..d384744a12 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -211,7 +211,6 @@ async function terminateOrphan(environment: string): Promise { for (const runner of orphanRunners) { // do we have a valid runnerId? then we are in a Jit Runner scenario else, use old method - logger.info(`Runner var us '${JSON.stringify(runner)}' `); if (runner.runnerId) { logger.info(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); // build a runner instance From 9064512e58620cb0a06c3b5eec2df8fedf818142 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Thu, 22 May 2025 10:46:15 +0200 Subject: [PATCH 05/38] fix(scale-up): remove unused import of run function --- lambdas/functions/control-plane/src/scale-runners/scale-up.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index faf9e4aece..747417cb8f 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -9,7 +9,6 @@ import { RunnerInputParameters } from './../aws/runners.d'; import ScaleError from './ScaleError'; import { publishRetryMessage } from './job-retry'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; -import { run } from '../local-down'; const logger = createChildLogger('scale-up'); From 716e0790fb557b2d3202ab0b51cdaf410788da84 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Fri, 23 May 2025 08:50:26 +0200 Subject: [PATCH 06/38] fix(scale-down): remove unused import of metricGitHubAppRateLimit --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index d384744a12..0ae8a87ee2 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -7,7 +7,6 @@ import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from '. import { RunnerInfo, RunnerList, RunnerState } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; -import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { getGitHubEnterpriseApiUrl } from './scale-up'; const logger = createChildLogger('scale-down'); From a5fcc88e8d95e147cd8fb5c9824501087f76cdeb Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 26 May 2025 11:34:36 +0200 Subject: [PATCH 07/38] Update lambdas/functions/control-plane/src/scale-runners/scale-down.ts Co-authored-by: Niek Palm --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 0ae8a87ee2..f48333e43b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -211,7 +211,7 @@ async function terminateOrphan(environment: string): Promise { for (const runner of orphanRunners) { // do we have a valid runnerId? then we are in a Jit Runner scenario else, use old method if (runner.runnerId) { - logger.info(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); + logger.debug(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); // build a runner instance const client = await getOrCreateOctokit(runner as RunnerInfo); const runnerId = parseInt(runner.runnerId); From bb8ba2bf56af9d461eaec6befa5b68f38a7bcd69 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 26 May 2025 11:34:49 +0200 Subject: [PATCH 08/38] Update lambdas/functions/control-plane/src/scale-runners/scale-down.ts Co-authored-by: Niek Palm --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index f48333e43b..9a02e79ec4 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -217,7 +217,7 @@ async function terminateOrphan(environment: string): Promise { const runnerId = parseInt(runner.runnerId); const ec2Instance = runner as RunnerInfo; const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); - logger.info(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); + logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); if (state.status === 'online' && state.busy) { logger.info(`Runner '${runner.instanceId}' is orphan, but is online and busy.`); await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); From 97de234cdaf3151e6f8a957d202a03cbd92b9f10 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 26 May 2025 11:35:34 +0200 Subject: [PATCH 09/38] Update lambdas/functions/control-plane/src/scale-runners/scale-up.ts Co-authored-by: Niek Palm --- lambdas/functions/control-plane/src/scale-runners/scale-up.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 747417cb8f..cc1aefc8c7 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -459,9 +459,6 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins metricGitHubAppRateLimit(runnerConfig.headers); // tag the EC2 instance with the Github runner id - logger.debug('Tagging instance with GitHub runner ID', { - runnerId: runnerConfig.data.runner.id, - }); await addGhRunnerIdToEC2InstanceTag(instance, runnerConfig.data.runner.id.toString()); // store jit config in ssm parameter store From 8b61d39203e6f4b92dcddc6109dc5c872ac5d603 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 26 May 2025 11:35:59 +0200 Subject: [PATCH 10/38] Update lambdas/functions/control-plane/src/scale-runners/scale-up.ts Co-authored-by: Niek Palm --- lambdas/functions/control-plane/src/scale-runners/scale-up.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index cc1aefc8c7..38485d48f5 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -418,7 +418,7 @@ async function createRegistrationTokenConfig( async function addGhRunnerIdToEC2InstanceTag(instanceId: string, runnerId: string): Promise { try { - await tag(instanceId, [{ Key: 'ghr:githubrunnerid', Value: runnerId }]); + await tag(instanceId, [{ Key: 'ghr:github_runner_id', Value: runnerId }]); logger.info(`Runner '${instanceId}' marked with ${runnerId}.`); } catch (e) { logger.error(`Failed to mark runner '${instanceId}' with ${runnerId}.`, { error: e }); From 9883b0b8490cde1dcfdb4ba5a623c2fe693aa450 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Thu, 29 May 2025 09:03:34 +0100 Subject: [PATCH 11/38] Remove warning log for orphan runners without runnerId in scale-down function --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 9a02e79ec4..f797735d53 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -229,7 +229,6 @@ async function terminateOrphan(environment: string): Promise { }); } } else { - logger.warn(`Runner '${runner.instanceId}' is orphan, but no runnerId found.`); logger.info(`Terminating orphan runner '${runner.instanceId}'`); await terminateRunner(runner.instanceId).catch((e) => { logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); From 43468a738990da4e4efc936d0beed252e1ff8ad2 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Thu, 29 May 2025 09:04:58 +0100 Subject: [PATCH 12/38] Remove logging of runner ID marking in addGhRunnerIdToEC2InstanceTag function --- lambdas/functions/control-plane/src/scale-runners/scale-up.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 38485d48f5..62da8b79b8 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -419,7 +419,6 @@ async function createRegistrationTokenConfig( async function addGhRunnerIdToEC2InstanceTag(instanceId: string, runnerId: string): Promise { try { await tag(instanceId, [{ Key: 'ghr:github_runner_id', Value: runnerId }]); - logger.info(`Runner '${instanceId}' marked with ${runnerId}.`); } catch (e) { logger.error(`Failed to mark runner '${instanceId}' with ${runnerId}.`, { error: e }); } From bc995ef3e0fb301a8b357c517f7d6e01680c08b9 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:05:02 +0100 Subject: [PATCH 13/38] readded metricGitHubAppRateLimit --- .../functions/control-plane/src/scale-runners/scale-down.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index f797735d53..54aa24a4d8 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -7,10 +7,12 @@ import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from '. import { RunnerInfo, RunnerList, RunnerState } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; +import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { getGitHubEnterpriseApiUrl } from './scale-up'; const logger = createChildLogger('scale-down'); + async function getOrCreateOctokit(runner: RunnerInfo): Promise { const key = runner.owner; const cachedOctokit = githubCache.clients.get(key); @@ -67,6 +69,8 @@ async function getGitHubSelfHostedRunnerState( async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId); + logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.data.busy}`); + metricGitHubAppRateLimit(state.headers); return state.busy; } From 6e1c72ca3d0826535bba182dd7a088ed25866d66 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:47:52 +0100 Subject: [PATCH 14/38] Refactor runner interfaces: remove RunnerState interface and update imports in scale-down.ts --- .../functions/control-plane/src/aws/runners.d.ts | 7 ------- .../control-plane/src/scale-runners/scale-down.ts | 13 ++++++++++++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index 752ff07edd..72ff9e3e1a 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -13,13 +13,6 @@ export interface RunnerList { runnerId?: string; } -export interface RunnerState { - id: number; - name: string; - busy: boolean; - status: string; -} - export interface RunnerInfo { instanceId: string; launchTime?: Date; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 54aa24a4d8..df1527e6af 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -4,14 +4,25 @@ import moment from 'moment'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; -import { RunnerInfo, RunnerList, RunnerState } from './../aws/runners.d'; +import { RunnerInfo, RunnerList } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { getGitHubEnterpriseApiUrl } from './scale-up'; +import { GetResponseDataTypeFromEndpointMethod } from '@octokit/types/dist-types/GetResponseTypeFromEndpointMethod'; const logger = createChildLogger('scale-down'); +type OrgRunnerList = GetResponseDataTypeFromEndpointMethod< + typeof Octokit.prototype.actions.listSelfHostedRunnersForOrg +>; + +type RepoRunnerList = GetResponseDataTypeFromEndpointMethod< + typeof Octokit.prototype.actions.listSelfHostedRunnersForRepo +>; + +// Derive the shape of an individual runner +type RunnerState = (OrgRunnerList | RepoRunnerList)['runners'][number]; async function getOrCreateOctokit(runner: RunnerInfo): Promise { const key = runner.owner; From 097c14d8c5b095777e24dd44942ea73942a9076a Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:13:47 +0100 Subject: [PATCH 15/38] Add headers to runner state return and update logging for busy state --- .../functions/control-plane/src/scale-runners/scale-down.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index df1527e6af..814c6928c0 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -75,12 +75,12 @@ async function getGitHubSelfHostedRunnerState( repo: ec2runner.owner.split('/')[1], }); - return { id: state.data.id, name: state.data.name, busy: state.data.busy, status: state.data.status }; + return { id: state.data.id, name: state.data.name, busy: state.data.busy, status: state.data.status, headers: state.headers }; } async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId); - logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.data.busy}`); + logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.busy}`); metricGitHubAppRateLimit(state.headers); return state.busy; } From 971ec2d8b1deffbb2d01146aa5e4d7ec16c67877 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:14:25 +0100 Subject: [PATCH 16/38] Remove redundant comment describing RunnerState type --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 814c6928c0..631074f8a5 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -21,7 +21,6 @@ type RepoRunnerList = GetResponseDataTypeFromEndpointMethod< typeof Octokit.prototype.actions.listSelfHostedRunnersForRepo >; -// Derive the shape of an individual runner type RunnerState = (OrgRunnerList | RepoRunnerList)['runners'][number]; async function getOrCreateOctokit(runner: RunnerInfo): Promise { From 5a275e5bf706b519dc84fb07819257a642702cdf Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:25:04 +0100 Subject: [PATCH 17/38] Implement last chance check for orphan runners and refactor termination logic --- .../src/scale-runners/scale-down.ts | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 631074f8a5..4e5cd38b17 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -218,6 +218,24 @@ async function markOrphan(instanceId: string): Promise { } } +async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise { + const client = await getOrCreateOctokit(runner as RunnerInfo); + const runnerId = parseInt(runner.runnerId); + const ec2Instance = runner as RunnerInfo; + const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); + logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); + if (state.status === 'online' && state.busy) { + logger.info(`Runner '${runner.instanceId}' is orphan, but is online and busy.`); + await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + } else if (state.status === 'offline') { + logger.warn(`Runner '${runner.instanceId}' is orphan.`); + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } +} + async function terminateOrphan(environment: string): Promise { try { const orphanRunners = await listEC2Runners({ environment, orphan: true }); @@ -226,22 +244,7 @@ async function terminateOrphan(environment: string): Promise { // do we have a valid runnerId? then we are in a Jit Runner scenario else, use old method if (runner.runnerId) { logger.debug(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); - // build a runner instance - const client = await getOrCreateOctokit(runner as RunnerInfo); - const runnerId = parseInt(runner.runnerId); - const ec2Instance = runner as RunnerInfo; - const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); - logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); - if (state.status === 'online' && state.busy) { - logger.info(`Runner '${runner.instanceId}' is orphan, but is online and busy.`); - await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); - } else if (state.status === 'offline') { - logger.warn(`Runner '${runner.instanceId}' is orphan.`); - logger.info(`Terminating orphan runner '${runner.instanceId}'`); - await terminateRunner(runner.instanceId).catch((e) => { - logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); - }); - } + await lastChanceCheckOrphanRunner(runner); } else { logger.info(`Terminating orphan runner '${runner.instanceId}'`); await terminateRunner(runner.instanceId).catch((e) => { From 9f59abe1668ef10ffcab27b301468d5dfa879656 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:32:49 +0100 Subject: [PATCH 18/38] Format return object in getGitHubSelfHostedRunnerState for improved readability --- .../control-plane/src/scale-runners/scale-down.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 4e5cd38b17..b257dd14f5 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -74,7 +74,13 @@ async function getGitHubSelfHostedRunnerState( repo: ec2runner.owner.split('/')[1], }); - return { id: state.data.id, name: state.data.name, busy: state.data.busy, status: state.data.status, headers: state.headers }; + return { + id: state.data.id, + name: state.data.name, + busy: state.data.busy, + status: state.data.status, + headers: state.headers, + }; } async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { From 65c0b0ee86cd46bb0ac9a92b8ddfefa5cd9e9cb8 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:57:24 +0100 Subject: [PATCH 19/38] Refactor runner state types to use Endpoints for improved type safety and clarity --- .../src/scale-runners/scale-down.ts | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index b257dd14f5..41d0e51495 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -1,4 +1,5 @@ import { Octokit } from '@octokit/rest'; +import { Endpoints } from '@octokit/types'; import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import moment from 'moment'; @@ -9,19 +10,12 @@ import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { getGitHubEnterpriseApiUrl } from './scale-up'; -import { GetResponseDataTypeFromEndpointMethod } from '@octokit/types/dist-types/GetResponseTypeFromEndpointMethod'; const logger = createChildLogger('scale-down'); -type OrgRunnerList = GetResponseDataTypeFromEndpointMethod< - typeof Octokit.prototype.actions.listSelfHostedRunnersForOrg ->; - -type RepoRunnerList = GetResponseDataTypeFromEndpointMethod< - typeof Octokit.prototype.actions.listSelfHostedRunnersForRepo ->; - -type RunnerState = (OrgRunnerList | RepoRunnerList)['runners'][number]; +type OrgRunnerList = Endpoints["GET /orgs/{org}/actions/runners"]["response"]["data"]["runners"]; +type RepoRunnerList = Endpoints["GET /repos/{owner}/{repo}/actions/runners"]["response"]["data"]["runners"]; +type RunnerState = OrgRunnerList[number] | RepoRunnerList[number]; async function getOrCreateOctokit(runner: RunnerInfo): Promise { const key = runner.owner; @@ -73,20 +67,23 @@ async function getGitHubSelfHostedRunnerState( owner: ec2runner.owner.split('/')[0], repo: ec2runner.owner.split('/')[1], }); + metricGitHubAppRateLimit(state.headers); return { id: state.data.id, name: state.data.name, busy: state.data.busy, status: state.data.status, - headers: state.headers, + os: state.data.os, + labels: state.data.labels, + runner_group_id: state.data.runner_group_id, + ephemeral: state.data.ephemeral, }; } async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId); logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.busy}`); - metricGitHubAppRateLimit(state.headers); return state.busy; } @@ -226,7 +223,7 @@ async function markOrphan(instanceId: string): Promise { async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise { const client = await getOrCreateOctokit(runner as RunnerInfo); - const runnerId = parseInt(runner.runnerId); + const runnerId = parseInt(runner.runnerId || '0'); const ec2Instance = runner as RunnerInfo; const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); From 8ead598a3fd69152ada71863e2f4df60c2546191 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:58:51 +0100 Subject: [PATCH 20/38] Fix formatting of type definitions and adjust indentation in getGitHubSelfHostedRunnerState for consistency --- .../functions/control-plane/src/scale-runners/scale-down.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 41d0e51495..1c8955d268 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -13,8 +13,8 @@ import { getGitHubEnterpriseApiUrl } from './scale-up'; const logger = createChildLogger('scale-down'); -type OrgRunnerList = Endpoints["GET /orgs/{org}/actions/runners"]["response"]["data"]["runners"]; -type RepoRunnerList = Endpoints["GET /repos/{owner}/{repo}/actions/runners"]["response"]["data"]["runners"]; +type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners']; +type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners']; type RunnerState = OrgRunnerList[number] | RepoRunnerList[number]; async function getOrCreateOctokit(runner: RunnerInfo): Promise { @@ -67,7 +67,7 @@ async function getGitHubSelfHostedRunnerState( owner: ec2runner.owner.split('/')[0], repo: ec2runner.owner.split('/')[1], }); - metricGitHubAppRateLimit(state.headers); + metricGitHubAppRateLimit(state.headers); return { id: state.data.id, From ab5b6b0b8b708eb8289f77396a55d6de6c4ce2c6 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 11:49:48 +0100 Subject: [PATCH 21/38] Update lambdas/functions/control-plane/src/aws/runners.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- lambdas/functions/control-plane/src/aws/runners.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index c8fb1465f7..6779dd39d2 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -92,7 +92,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string, org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string, orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true', - runnerId: i.Tags?.find((e) => e.Key === 'ghr:githubrunnerid')?.Value as string, + runnerId: i.Tags?.find((e) => e.Key === 'ghr:github_runner_id')?.Value as string, }); } } From 0b462bbab9ad48084fa7d9696c1c96ccd6d54c2d Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 11:50:10 +0100 Subject: [PATCH 22/38] Update lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../control-plane/src/scale-runners/scale-down.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 318808b95c..8dd25323a6 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -370,11 +370,11 @@ describe('Scale down runners', () => { if (type === 'Repo') { mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ - data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, }); } else { mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ - data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, }); } From 102edf0afcba291f2e6ec288c3a432cb1628a915 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 4 Jun 2025 11:52:43 +0100 Subject: [PATCH 23/38] Fix typo in key for GitHub runner ID in mock running instances --- lambdas/functions/control-plane/src/aws/runners.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index 008f29433f..9d218605bb 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -67,7 +67,7 @@ const mockRunningInstancesJit: DescribeInstancesResult = { { Key: 'ghr:created_by', Value: 'scale-up-lambda' }, { Key: 'ghr:Type', Value: 'Org' }, { Key: 'ghr:Owner', Value: 'CoderToCat' }, - { Key: 'ghr:githubrunnerid', Value: '9876543210' }, + { Key: 'ghr:github_runner_id', Value: '9876543210' }, ], }, ], From 898226d89095fedebd8aeba28d06e8323ae5bfd7 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 7 Jul 2025 11:04:31 +0200 Subject: [PATCH 24/38] fix: add missing ec2:RemoveTags action in lambda-scale-down policy --- modules/runners/policies/lambda-scale-down.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/runners/policies/lambda-scale-down.json b/modules/runners/policies/lambda-scale-down.json index 0f73aeacf1..549e397dea 100644 --- a/modules/runners/policies/lambda-scale-down.json +++ b/modules/runners/policies/lambda-scale-down.json @@ -15,7 +15,8 @@ "Effect": "Allow", "Action": [ "ec2:TerminateInstances", - "ec2:CreateTags" + "ec2:CreateTags", + "ec2:RemoveTags" ], "Resource": [ "*" @@ -30,7 +31,8 @@ "Effect": "Allow", "Action": [ "ec2:TerminateInstances", - "ec2:CreateTags" + "ec2:CreateTags", + "ec2:RemoveTags" ], "Resource": [ "*" From 8ff99019450ec34c33f8184436b8b889d9b376d7 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Tue, 8 Jul 2025 15:50:28 +0200 Subject: [PATCH 25/38] fix: streamline orphan runner handling and improve logging --- .../src/scale-runners/scale-down.ts | 39 ++++++++----------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 1eae0362a8..da31ff9ec6 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -69,16 +69,7 @@ async function getGitHubSelfHostedRunnerState( }); metricGitHubAppRateLimit(state.headers); - return { - id: state.data.id, - name: state.data.name, - busy: state.data.busy, - status: state.data.status, - os: state.data.os, - labels: state.data.labels, - runner_group_id: state.data.runner_group_id, - ephemeral: state.data.ephemeral, - }; + return state.data; } async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { @@ -221,22 +212,26 @@ async function markOrphan(instanceId: string): Promise { } } -async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise { +async function unmarkOrphan(instanceId: string): Promise { + try { + await untag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + logger.info(`Runner '${instanceId}' unmarked as orphan.`); + } catch (e) { + logger.error(`Failed to unmark runner '${instanceId}' as orphan.`, { error: e }); + } +} + +async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise { const client = await getOrCreateOctokit(runner as RunnerInfo); const runnerId = parseInt(runner.runnerId || '0'); const ec2Instance = runner as RunnerInfo; const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); + var isOrphan = false; logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); - if (state.status === 'online' && state.busy) { - logger.info(`Runner '${runner.instanceId}' is orphan, but is online and busy.`); - await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); - } else if (state.status === 'offline') { - logger.warn(`Runner '${runner.instanceId}' is orphan.`); - logger.info(`Terminating orphan runner '${runner.instanceId}'`); - await terminateRunner(runner.instanceId).catch((e) => { - logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); - }); + if (state.status === 'offline') { + isOrphan = true; } + return isOrphan } async function terminateOrphan(environment: string): Promise { @@ -244,10 +239,8 @@ async function terminateOrphan(environment: string): Promise { const orphanRunners = await listEC2Runners({ environment, orphan: true }); for (const runner of orphanRunners) { - // do we have a valid runnerId? then we are in a Jit Runner scenario else, use old method if (runner.runnerId) { - logger.debug(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); - await lastChanceCheckOrphanRunner(runner); + await lastChanceCheckOrphanRunner(runner) ? terminateRunner(runner.instanceId) : unmarkOrphan(runner.instanceId); } else { logger.info(`Terminating orphan runner '${runner.instanceId}'`); await terminateRunner(runner.instanceId).catch((e) => { From e4a4bfa58781e16275d19528805c29f744bfc6db Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:21:43 +0200 Subject: [PATCH 26/38] fix(lambda): replace ec2:RemoveTags with ec2:DeleteTags and tag:UntagResources in scale-down policy --- modules/runners/policies/lambda-scale-down.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/runners/policies/lambda-scale-down.json b/modules/runners/policies/lambda-scale-down.json index 549e397dea..ae082f1e3e 100644 --- a/modules/runners/policies/lambda-scale-down.json +++ b/modules/runners/policies/lambda-scale-down.json @@ -16,7 +16,8 @@ "Action": [ "ec2:TerminateInstances", "ec2:CreateTags", - "ec2:RemoveTags" + "ec2:DeleteTags", + "tag:UntagResources" ], "Resource": [ "*" @@ -32,7 +33,8 @@ "Action": [ "ec2:TerminateInstances", "ec2:CreateTags", - "ec2:RemoveTags" + "ec2:DeleteTags", + "tag:UntagResources" ], "Resource": [ "*" From f5a0388ecab22334883a27ced02f4f3908ba3751 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 14:10:19 +0200 Subject: [PATCH 27/38] fix: update logging messages for orphan runner tagging and state checks --- .../src/scale-runners/scale-down.ts | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index da31ff9ec6..ffe441324f 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -206,18 +206,18 @@ async function evaluateAndRemoveRunners( async function markOrphan(instanceId: string): Promise { try { await tag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); - logger.info(`Runner '${instanceId}' marked as orphan.`); + logger.info(`Runner '${instanceId}' tagged as orphan.`); } catch (e) { - logger.error(`Failed to mark runner '${instanceId}' as orphan.`, { error: e }); + logger.error(`Failed to tag runner '${instanceId}' as orphan.`, { error: e }); } } async function unmarkOrphan(instanceId: string): Promise { try { await untag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); - logger.info(`Runner '${instanceId}' unmarked as orphan.`); + logger.info(`Runner '${instanceId}' untagged as orphan.`); } catch (e) { - logger.error(`Failed to unmark runner '${instanceId}' as orphan.`, { error: e }); + logger.error(`Failed to un-tag runner '${instanceId}' as orphan.`, { error: e }); } } @@ -227,11 +227,15 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise const ec2Instance = runner as RunnerInfo; const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); var isOrphan = false; - logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); - if (state.status === 'offline') { + logger.debug( + `Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`, + ); + const isOfflineAndBusy = state.status === 'offline' && state.busy; + if (isOfflineAndBusy) { isOrphan = true; } - return isOrphan + logger.info(`Runner '${runner.instanceId}' ${isOrphan ? 'is judged to be' : 'is judged to not be'} orphaned.`); + return isOrphan; } async function terminateOrphan(environment: string): Promise { @@ -240,7 +244,9 @@ async function terminateOrphan(environment: string): Promise { for (const runner of orphanRunners) { if (runner.runnerId) { - await lastChanceCheckOrphanRunner(runner) ? terminateRunner(runner.instanceId) : unmarkOrphan(runner.instanceId); + (await lastChanceCheckOrphanRunner(runner)) + ? terminateRunner(runner.instanceId) + : unmarkOrphan(runner.instanceId); } else { logger.info(`Terminating orphan runner '${runner.instanceId}'`); await terminateRunner(runner.instanceId).catch((e) => { From a245b2ba4c4deb1325fa28960dfd5ccf54aa24b6 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 15:09:28 +0200 Subject: [PATCH 28/38] fix: improve logging message for orphan runner judgment --- .../functions/control-plane/src/scale-runners/scale-down.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index ffe441324f..c2929f7493 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -230,11 +230,11 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise logger.debug( `Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`, ); - const isOfflineAndBusy = state.status === 'offline' && state.busy; + const isOfflineAndBusy = state.status === 'offline' && (state.busy || !state.busy); if (isOfflineAndBusy) { isOrphan = true; } - logger.info(`Runner '${runner.instanceId}' ${isOrphan ? 'is judged to be' : 'is judged to not be'} orphaned.`); + logger.info(`Runner is judged to '${runner.instanceId}' ${isOrphan ? 'be' : 'not be'} orphaned.`); return isOrphan; } From 6dca64f3e80dca862ded0a7b6259b289185c4d5c Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 15:16:27 +0200 Subject: [PATCH 29/38] fix: change var to let for isOrphan in lastChanceCheckOrphanRunner function --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index c2929f7493..b8d0faa09a 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -226,7 +226,7 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise const runnerId = parseInt(runner.runnerId || '0'); const ec2Instance = runner as RunnerInfo; const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); - var isOrphan = false; + let isOrphan = false; logger.debug( `Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`, ); From c335fc76630fc0a693020dbfcd58360d1a0a64dc Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 15:21:39 +0200 Subject: [PATCH 30/38] fix: refactor orphan termination logic for clarity and consistency --- .../control-plane/src/scale-runners/scale-down.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index b8d0faa09a..91182e42c6 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -244,9 +244,12 @@ async function terminateOrphan(environment: string): Promise { for (const runner of orphanRunners) { if (runner.runnerId) { - (await lastChanceCheckOrphanRunner(runner)) - ? terminateRunner(runner.instanceId) - : unmarkOrphan(runner.instanceId); + const isOrphan = await lastChanceCheckOrphanRunner(runner); + if (isOrphan) { + await terminateRunner(runner.instanceId); + } else { + await unmarkOrphan(runner.instanceId); + } } else { logger.info(`Terminating orphan runner '${runner.instanceId}'`); await terminateRunner(runner.instanceId).catch((e) => { From 1baf4afc238eb0bc26797c507d10cb5c8320d578 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 15:36:18 +0200 Subject: [PATCH 31/38] fix: rename addGhRunnerIdToEC2InstanceTag to tagRunnerId for clarity --- lambdas/functions/control-plane/src/scale-runners/scale-up.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 62da8b79b8..638edd3232 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -416,7 +416,7 @@ async function createRegistrationTokenConfig( } } -async function addGhRunnerIdToEC2InstanceTag(instanceId: string, runnerId: string): Promise { +async function tagRunnerId(instanceId: string, runnerId: string): Promise { try { await tag(instanceId, [{ Key: 'ghr:github_runner_id', Value: runnerId }]); } catch (e) { @@ -458,7 +458,7 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins metricGitHubAppRateLimit(runnerConfig.headers); // tag the EC2 instance with the Github runner id - await addGhRunnerIdToEC2InstanceTag(instance, runnerConfig.data.runner.id.toString()); + await tagRunnerId(instance, runnerConfig.data.runner.id.toString()); // store jit config in ssm parameter store logger.debug('Runner JIT config for ephemeral runner generated.', { From c2b103375f35edcec2c6c831673d8da39c88d0a2 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 16:01:21 +0200 Subject: [PATCH 32/38] fix: improve logging message format for orphan runner judgment --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 91182e42c6..f8a4415b0f 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -234,7 +234,7 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise if (isOfflineAndBusy) { isOrphan = true; } - logger.info(`Runner is judged to '${runner.instanceId}' ${isOrphan ? 'be' : 'not be'} orphaned.`); + logger.info(`Runner '${runner.instanceId}' is judged to ${isOrphan ? 'be' : 'not be'} orphaned.`); return isOrphan; } From 36176085bf3eee3d5be148a09cc4cbccb702dc13 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 16:03:14 +0200 Subject: [PATCH 33/38] fix: enhance orphan runner judgment logic for clarity --- .../functions/control-plane/src/scale-runners/scale-down.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index f8a4415b0f..5c7ddb287c 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -230,8 +230,9 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise logger.debug( `Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`, ); - const isOfflineAndBusy = state.status === 'offline' && (state.busy || !state.busy); - if (isOfflineAndBusy) { + const isOfflineAndBusy = state.status === 'offline' && state.busy; + const isOfflineAndIdle = state.status === 'offline' && !state.busy; + if (isOfflineAndBusy || isOfflineAndIdle) { isOrphan = true; } logger.info(`Runner '${runner.instanceId}' is judged to ${isOrphan ? 'be' : 'not be'} orphaned.`); From b8af3f0236d14c07cd1cbd5a626f85f8b742a52e Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Wed, 9 Jul 2025 16:08:01 +0200 Subject: [PATCH 34/38] fix: simplify orphan runner judgment logic by removing checks that infringe on other checks --- .../functions/control-plane/src/scale-runners/scale-down.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 5c7ddb287c..0549d935a4 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -231,8 +231,7 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise `Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`, ); const isOfflineAndBusy = state.status === 'offline' && state.busy; - const isOfflineAndIdle = state.status === 'offline' && !state.busy; - if (isOfflineAndBusy || isOfflineAndIdle) { + if (isOfflineAndBusy) { isOrphan = true; } logger.info(`Runner '${runner.instanceId}' is judged to ${isOrphan ? 'be' : 'not be'} orphaned.`); From 835ca85c9e8ada0906f6e3c7d11fc96576379e3b Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 21 Jul 2025 17:48:44 +0200 Subject: [PATCH 35/38] fix(scale-down): rename unmarkOrphan function to unMarkOrphan for consistency --- .../functions/control-plane/src/scale-runners/scale-down.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 0549d935a4..8f5cbd42d4 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -212,7 +212,7 @@ async function markOrphan(instanceId: string): Promise { } } -async function unmarkOrphan(instanceId: string): Promise { +async function unMarkOrphan(instanceId: string): Promise { try { await untag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); logger.info(`Runner '${instanceId}' untagged as orphan.`); @@ -248,7 +248,7 @@ async function terminateOrphan(environment: string): Promise { if (isOrphan) { await terminateRunner(runner.instanceId); } else { - await unmarkOrphan(runner.instanceId); + await unMarkOrphan(runner.instanceId); } } else { logger.info(`Terminating orphan runner '${runner.instanceId}'`); From 3551c779189a0a8ab94beb873e95790b6c3e21b0 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 21 Jul 2025 18:17:48 +0200 Subject: [PATCH 36/38] fix(lambda): remove unused tag:UntagResources action from policy --- modules/runners/policies/lambda-scale-down.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/runners/policies/lambda-scale-down.json b/modules/runners/policies/lambda-scale-down.json index ae082f1e3e..d35be746b7 100644 --- a/modules/runners/policies/lambda-scale-down.json +++ b/modules/runners/policies/lambda-scale-down.json @@ -16,8 +16,7 @@ "Action": [ "ec2:TerminateInstances", "ec2:CreateTags", - "ec2:DeleteTags", - "tag:UntagResources" + "ec2:DeleteTags" ], "Resource": [ "*" @@ -33,8 +32,7 @@ "Action": [ "ec2:TerminateInstances", "ec2:CreateTags", - "ec2:DeleteTags", - "tag:UntagResources" + "ec2:DeleteTags" ], "Resource": [ "*" From 441c4f445f2e1adbcdd9190d3d9a415ca527ab23 Mon Sep 17 00:00:00 2001 From: Stuart Pearson <1926002+stuartp44@users.noreply.github.com> Date: Mon, 21 Jul 2025 18:22:22 +0200 Subject: [PATCH 37/38] fix(runner): update orphan tag value to 'true' in untag runner test --- lambdas/functions/control-plane/src/aws/runners.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index 9d218605bb..c4ec328c9b 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -284,7 +284,7 @@ describe('untag runner', () => { owner: 'owner-2', type: 'Repo', }; - //await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: '' }]); + await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { Resources: [runner.instanceId], Tags: [{ Key: 'ghr:orphan', Value: 'true' }], From 9aa3203e9ee98130d1db23199a3543f4703909b4 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Mon, 21 Jul 2025 21:08:16 +0200 Subject: [PATCH 38/38] fix: Tag non jit instances with runner id and add docs (#4667) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ⚠️ This PR is on top of #4539 This pull request is an enhancement on top of #4539. The PR also introduces tagging via the start scripts for non-JIT configured instances. And it adds a state diagram to the ever growing complexity of the state diagram below terminating instances. ### Instance Tagging Enhancements: * Add function to tag runners with runner id for non JIT instances. Functions are designed to ignore errors to avoid causing a crash of the runner registration process. * side effect is that the instance is allowed to create the tag `ghr:github_runner_id` the instance is allowed to create the tag only on itself. ### Added docs * State diagram for scale-down added Untitled diagram _ Mermaid
Chart-2025-07-20-140732 --------- Co-authored-by: github-aws-runners-pr|bot --- mkdocs.yaml | 5 + modules/runners/README.md | 3 + modules/runners/policies-runner.tf | 6 + .../policies/instance-create-tags-policy.json | 20 +++ modules/runners/scale-down-state-diagram.md | 150 ++++++++++++++++++ modules/runners/templates/start-runner.ps1 | 41 +++++ modules/runners/templates/start-runner.sh | 33 ++++ 7 files changed, 258 insertions(+) create mode 100644 modules/runners/policies/instance-create-tags-policy.json create mode 100644 modules/runners/scale-down-state-diagram.md diff --git a/mkdocs.yaml b/mkdocs.yaml index d4bb359f42..9b98e84a36 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -46,6 +46,11 @@ markdown_extensions: - admonition - pymdownx.details - pymdownx.superfences + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format nav: - Introduction: index.md diff --git a/modules/runners/README.md b/modules/runners/README.md index 397236881d..34ebb61694 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -18,6 +18,8 @@ The scale up lambda is triggered by events on a SQS queue. Events on this queue The scale down lambda is triggered via a CloudWatch event. The event is triggered by a cron expression defined in the variable `scale_down_schedule_expression` (https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html). For scaling down GitHub does not provide a good API yet, therefore we run the scaling down based on this event every x minutes. Each time the lambda is triggered it tries to remove all runners older than x minutes (configurable) managed in this deployment. In case the runner can be removed from GitHub, which means it is not executing a workflow, the lambda will terminate the EC2 instance. +--8<-- "modules/runners/scale-down-state-diagram.md:mkdocs_scale_down_state_diagram" + ## Lambda Function The Lambda function is written in [TypeScript](https://www.typescriptlang.org/) and requires Node 12.x and yarn. Sources are located in [./lambdas/runners]. Two lambda functions share the same sources, there is one entry point for `scaleDown` and another one for `scaleUp`. @@ -85,6 +87,7 @@ yarn run dist | [aws_iam_role.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role_policy.cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.create_tag](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.describe_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.dist_bucket](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.ec2](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | diff --git a/modules/runners/policies-runner.tf b/modules/runners/policies-runner.tf index d1b9190930..d923c143cb 100644 --- a/modules/runners/policies-runner.tf +++ b/modules/runners/policies-runner.tf @@ -57,6 +57,12 @@ resource "aws_iam_role_policy" "describe_tags" { policy = file("${path.module}/policies/instance-describe-tags-policy.json") } +resource "aws_iam_role_policy" "create_tag" { + name = "runner-create-tags" + role = aws_iam_role.runner.name + policy = templatefile("${path.module}/policies/instance-create-tags-policy.json", {}) +} + resource "aws_iam_role_policy_attachment" "managed_policies" { count = length(var.runner_iam_role_managed_policy_arns) role = aws_iam_role.runner.name diff --git a/modules/runners/policies/instance-create-tags-policy.json b/modules/runners/policies/instance-create-tags-policy.json new file mode 100644 index 0000000000..9da09fcb70 --- /dev/null +++ b/modules/runners/policies/instance-create-tags-policy.json @@ -0,0 +1,20 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": "ec2:CreateTags", + "Condition": { + "ForAllValues:StringEquals": { + "aws:TagKeys": [ + "ghr:github_runner_id" + ] + }, + "StringEquals": { + "aws:ARN": "$${ec2:SourceInstanceARN}" + } + }, + "Effect": "Allow", + "Resource": "arn:*:ec2:*:*:instance/*" + } + ] +} diff --git a/modules/runners/scale-down-state-diagram.md b/modules/runners/scale-down-state-diagram.md new file mode 100644 index 0000000000..b4f260eb2a --- /dev/null +++ b/modules/runners/scale-down-state-diagram.md @@ -0,0 +1,150 @@ +# GitHub Actions Runner Scale-Down State Diagram + + + +The scale-down Lambda function runs on a scheduled basis (every 5 minutes by default) to manage GitHub Actions runner instances. It performs a two-phase cleanup process: first terminating confirmed orphaned instances, then evaluating active runners to maintain the desired idle capacity while removing unnecessary instances. + +```mermaid +stateDiagram-v2 + [*] --> ScheduledExecution : Cron Trigger every 5 min + + ScheduledExecution --> Phase1_OrphanTermination : Start Phase 1 + + state Phase1_OrphanTermination { + [*] --> ListOrphanInstances : Query EC2 for ghr orphan true + + ListOrphanInstances --> CheckOrphanType : For each orphan + + state CheckOrphanType <> + CheckOrphanType --> HasRunnerIdTag : Has ghr github runner id + CheckOrphanType --> TerminateOrphan : No runner ID tag + + HasRunnerIdTag --> LastChanceCheck : Query GitHub API + + state LastChanceCheck <> + LastChanceCheck --> ConfirmedOrphan : Offline and busy + LastChanceCheck --> FalsePositive : Exists and not problematic + + ConfirmedOrphan --> TerminateOrphan + FalsePositive --> RemoveOrphanTag + + TerminateOrphan --> NextOrphan : Continue processing + RemoveOrphanTag --> NextOrphan + + NextOrphan --> CheckOrphanType : More orphans? + NextOrphan --> Phase2_ActiveRunners : All processed + } + + Phase1_OrphanTermination --> Phase2_ActiveRunners : Phase 1 Complete + + state Phase2_ActiveRunners { + [*] --> ListActiveRunners : Query non-orphan EC2 instances + + ListActiveRunners --> GroupByOwner : Sort by owner and repo + + GroupByOwner --> ProcessOwnerGroup : For each owner + + state ProcessOwnerGroup { + [*] --> SortByStrategy : Apply eviction strategy + SortByStrategy --> ProcessRunner : Oldest first or newest first + + ProcessRunner --> QueryGitHub : Get GitHub runners for owner + + QueryGitHub --> MatchRunner : Find runner by instance ID suffix + + state MatchRunner <> + MatchRunner --> FoundInGitHub : Runner exists in GitHub + MatchRunner --> NotFoundInGitHub : Runner not in GitHub + + state FoundInGitHub { + [*] --> CheckMinimumTime : Has minimum runtime passed? + + state CheckMinimumTime <> + CheckMinimumTime --> TooYoung : Runtime less than minimum + CheckMinimumTime --> CheckIdleQuota : Runtime greater than or equal to minimum + + TooYoung --> NextRunner + + state CheckIdleQuota <> + CheckIdleQuota --> KeepIdle : Idle quota available + CheckIdleQuota --> CheckBusyState : Quota full + + KeepIdle --> NextRunner + + state CheckBusyState <> + CheckBusyState --> KeepBusy : Runner busy + CheckBusyState --> TerminateIdle : Runner idle + + KeepBusy --> NextRunner + TerminateIdle --> DeregisterFromGitHub + DeregisterFromGitHub --> TerminateInstance + TerminateInstance --> NextRunner + } + + state NotFoundInGitHub { + [*] --> CheckBootTime : Has boot time exceeded? + + state CheckBootTime <> + CheckBootTime --> StillBooting : Boot time less than threshold + CheckBootTime --> MarkOrphan : Boot time greater than or equal to threshold + + StillBooting --> NextRunner + MarkOrphan --> TagAsOrphan : Set ghr orphan true + TagAsOrphan --> NextRunner + } + + NextRunner --> ProcessRunner : More runners in group? + NextRunner --> NextOwnerGroup : Group complete + } + + NextOwnerGroup --> ProcessOwnerGroup : More owner groups? + NextOwnerGroup --> ExecutionComplete : All groups processed + } + + Phase2_ActiveRunners --> ExecutionComplete : Phase 2 Complete + + ExecutionComplete --> [*] : Wait for next cron trigger + + note right of LastChanceCheck + Uses ghr github runner id tag + for precise GitHub API lookup + end note + + note right of MatchRunner + Matches GitHub runner name + ending with EC2 instance ID + end note + + note right of CheckMinimumTime + Minimum running time in minutes + (Linux: 5min, Windows: 15min) + end note + + note right of CheckBootTime + Runner boot time in minutes + Default configuration value + end note +``` + + + +## Key Decision Points + +| State | Condition | Action | +|-------|-----------|--------| +| **Orphan w/ Runner ID** | GitHub: offline + busy | Terminate (confirmed orphan) | +| **Orphan w/ Runner ID** | GitHub: exists + healthy | Remove orphan tag (false positive) | +| **Orphan w/o Runner ID** | Always | Terminate (no way to verify) | +| **Active Runner Found** | Runtime < minimum | Keep (too young) | +| **Active Runner Found** | Idle quota available | Keep as idle | +| **Active Runner Found** | Quota full + idle | Terminate + deregister | +| **Active Runner Found** | Quota full + busy | Keep running | +| **Active Runner Missing** | Boot time exceeded | Mark as orphan | +| **Active Runner Missing** | Still booting | Wait | + +## Configuration Parameters + +- **Cron Schedule**: `cron(*/5 * * * ? *)` (every 5 minutes) +- **Minimum Runtime**: Linux 5min, Windows 15min +- **Boot Timeout**: Configurable via `runner_boot_time_in_minutes` +- **Idle Config**: Per-environment configuration for desired idle runners diff --git a/modules/runners/templates/start-runner.ps1 b/modules/runners/templates/start-runner.ps1 index 1ced28dcba..ae2eeff3c9 100644 --- a/modules/runners/templates/start-runner.ps1 +++ b/modules/runners/templates/start-runner.ps1 @@ -1,6 +1,44 @@ ## Retrieve instance metadata +function Tag-InstanceWithRunnerId { + Write-Host "Checking for .runner file to extract agent ID" + + $runnerFilePath = "$pwd\.runner" + if (-not (Test-Path $runnerFilePath)) { + Write-Host "Warning: .runner file not found" + return $true + } + + Write-Host "Found .runner file, extracting agent ID" + try { + $runnerConfig = Get-Content $runnerFilePath | ConvertFrom-Json + $agentId = $runnerConfig.agentId + + if (-not $agentId -or $agentId -eq $null) { + Write-Host "Warning: Could not extract agent ID from .runner file" + return $true + } + + Write-Host "Tagging instance with GitHub runner agent ID: $agentId" + $tagResult = aws ec2 create-tags --region "$Region" --resources "$InstanceId" --tags "Key=ghr:github_runner_id,Value=$agentId" 2>&1 + + if ($LASTEXITCODE -eq 0) { + Write-Host "Successfully tagged instance with agent ID: $agentId" + return $true + } else { + Write-Host "Warning: Failed to tag instance with agent ID - $tagResult" + return $true + } + } + catch { + Write-Host "Warning: Error processing .runner file - $($_.Exception.Message)" + return $true + } +} + +## Retrieve instance metadata + Write-Host "Retrieving TOKEN from AWS API" $token=Invoke-RestMethod -Method PUT -Uri "http://169.254.169.254/latest/api/token" -Headers @{"X-aws-ec2-metadata-token-ttl-seconds" = "180"} if ( ! $token ) { @@ -122,6 +160,9 @@ if ($enable_jit_config -eq "false" -or $agent_mode -ne "ephemeral") { $configCmd = ".\config.cmd --unattended --name $runner_name_prefix$InstanceId --work `"_work`" $runnerExtraOptions $config" Write-Host "Configure GH Runner (non ephmeral / no JIT) as user $run_as" Invoke-Expression $configCmd + + # Tag instance with GitHub runner agent ID for non-JIT runners + Tag-InstanceWithRunnerId } $jsonBody = @( diff --git a/modules/runners/templates/start-runner.sh b/modules/runners/templates/start-runner.sh index 1c1f3d5e9f..7f2c0f82c5 100644 --- a/modules/runners/templates/start-runner.sh +++ b/modules/runners/templates/start-runner.sh @@ -58,6 +58,36 @@ create_xray_error_segment() { echo "$SEGMENT_DOC" } +tag_instance_with_runner_id() { + echo "Checking for .runner file to extract agent ID" + + if [[ ! -f "/opt/actions-runner/.runner" ]]; then + echo "Warning: .runner file not found" + return 0 + fi + + echo "Found .runner file, extracting agent ID" + local agent_id + agent_id=$(jq -r '.agentId' /opt/actions-runner/.runner 2>/dev/null || echo "") + + if [[ -z "$agent_id" || "$agent_id" == "null" ]]; then + echo "Warning: Could not extract agent ID from .runner file" + return 0 + fi + + echo "Tagging instance with GitHub runner agent ID: $agent_id" + if aws ec2 create-tags \ + --region "$region" \ + --resources "$instance_id" \ + --tags Key=ghr:github_runner_id,Value="$agent_id"; then + echo "Successfully tagged instance with agent ID: $agent_id" + return 0 + else + echo "Warning: Failed to tag instance with agent ID" + return 0 + fi +} + cleanup() { local exit_code="$1" local error_location="$2" @@ -225,6 +255,9 @@ if [[ "$enable_jit_config" == "false" || $agent_mode != "ephemeral" ]]; then extra_flags="" fi sudo --preserve-env=RUNNER_ALLOW_RUNASROOT -u "$run_as" -- ./config.sh $${extra_flags} --unattended --name "$runner_name_prefix$instance_id" --work "_work" $${config} + + # Tag instance with GitHub runner agent ID for non-JIT runners + tag_instance_with_runner_id fi create_xray_success_segment "$SEGMENT"