From a0f7e5c51ccc626c2a9bc47f1ed3df612e693db6 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 17 Jul 2024 09:12:17 +0200 Subject: [PATCH 1/8] feat: terminate orphan runners in the second scale down cycle --- .../control-plane/src/aws/runners.d.ts | 1 + .../control-plane/src/aws/runners.test.ts | 23 ++++++++++++++++++- .../control-plane/src/aws/runners.ts | 11 +++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index ea1f86b735..a3741b4f9d 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -22,6 +22,7 @@ export interface ListRunnerFilters { runnerType?: RunnerType; runnerOwner?: string; environment?: string; + orphan?: boolean; statuses?: string[]; } diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index d233b16a69..3636c868d9 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -3,6 +3,7 @@ import { CreateFleetCommandInput, CreateFleetInstance, CreateFleetResult, + CreateTagsCommand, DefaultTargetCapacityType, DescribeInstancesCommand, DescribeInstancesResult, @@ -16,7 +17,7 @@ import { mockClient } from 'aws-sdk-client-mock'; import 'aws-sdk-client-mock-jest'; import ScaleError from './../scale-runners/ScaleError'; -import { createRunner, listEC2Runners, terminateRunner } from './runners'; +import { createRunner, listEC2Runners, tag, terminateRunner } from './runners'; import { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d'; process.env.AWS_REGION = 'eu-east-1'; @@ -182,6 +183,26 @@ describe('terminate runner', () => { }); }); +describe('tag runner', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + it('adding extra tag', async () => { + mockEC2Client.on(CreateTagsCommand).resolves({}); + const runner: RunnerInfo = { + instanceId: 'instance-2', + owner: 'owner-2', + type: 'Repo', + }; + await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'truer' }]); + + expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'truer' }], + }); + }); +}); + describe('create runner', () => { const defaultRunnerConfig: RunnerConfig = { allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED, diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 889d3c5f8a..14bbd5b69e 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -1,10 +1,12 @@ import { CreateFleetCommand, CreateFleetResult, + CreateTagsCommand, DescribeInstancesCommand, DescribeInstancesResult, EC2Client, FleetLaunchTemplateOverridesRequest, + Tag, TerminateInstancesCommand, _InstanceType, } from '@aws-sdk/client-ec2'; @@ -46,6 +48,9 @@ function constructFilters(filters?: Runners.ListRunnerFilters): Ec2Filter[][] { ec2FiltersBase.push({ Name: `tag:ghr:Type`, Values: [filters.runnerType] }); ec2FiltersBase.push({ Name: `tag:ghr:Owner`, Values: [filters.runnerOwner] }); } + if (filters.orphan) { + ec2FiltersBase.push({ Name: 'tag:ghr:orphan', Values: ['true'] }); + } } for (const key of ['tag:ghr:Application']) { @@ -100,6 +105,12 @@ export async function terminateRunner(instanceId: string): Promise { logger.info(`Runner ${instanceId} has been terminated.`); } +export async function tag(instanceId: string, tags: Tag[]): Promise { + logger.info(`Tagging '${instanceId}'`, { tags }); + const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); + await ec2.send(new CreateTagsCommand({ Resources: [instanceId], Tags: tags })); +} + function generateFleetOverrides( subnetIds: string[], instancesTypes: string[], From 4d2b910c48ed516a576339df52d09c473e5d21ca Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 17 Jul 2024 17:37:53 +0200 Subject: [PATCH 2/8] feat: mark orphan runners before removing them --- .../functions/control-plane/jest.config.ts | 8 +- .../control-plane/src/aws/runners.d.ts | 1 + .../control-plane/src/aws/runners.test.ts | 34 ++++++ .../control-plane/src/aws/runners.ts | 1 + .../src/scale-runners/scale-down.test.ts | 108 ++++++++++++++---- .../src/scale-runners/scale-down.ts | 34 ++++-- 6 files changed, 154 insertions(+), 32 deletions(-) diff --git a/lambdas/functions/control-plane/jest.config.ts b/lambdas/functions/control-plane/jest.config.ts index e96f2cbc3d..74cc3ff598 100644 --- a/lambdas/functions/control-plane/jest.config.ts +++ b/lambdas/functions/control-plane/jest.config.ts @@ -6,10 +6,10 @@ const config: Config = { ...defaultConfig, coverageThreshold: { global: { - statements: 97.79, - branches: 96.13, - functions: 95.4, - lines: 98.06, + statements: 97.99, + branches: 96.04, + functions: 97.53, + lines: 98.27, }, }, }; diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index a3741b4f9d..3a1b31b1cf 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -9,6 +9,7 @@ export interface RunnerList { type?: string; repo?: string; org?: string; + orphan?: boolean; } export interface RunnerInfo { diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index 3636c868d9..09a0e8b710 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -68,6 +68,23 @@ describe('list instances', () => { launchTime: new Date('2020-10-10T14:48:00.000+09:00'), type: 'Org', owner: 'CoderToCat', + orphan: false, + }); + }); + + it('check orphan tag.', async () => { + const instances: DescribeInstancesResult = mockRunningInstances; + instances.Reservations![0].Instances![0].Tags!.push({ Key: 'ghr:orphan', Value: 'true' }); + mockEC2Client.on(DescribeInstancesCommand).resolves(instances); + + const resp = await listEC2Runners(); + expect(resp.length).toBe(1); + expect(resp).toContainEqual({ + instanceId: instances.Reservations![0].Instances![0].InstanceId!, + launchTime: instances.Reservations![0].Instances![0].LaunchTime!, + type: 'Org', + owner: 'CoderToCat', + orphan: true, }); }); @@ -115,6 +132,23 @@ describe('list instances', () => { }); }); + it('filters instances on environment and orphan', async () => { + mockRunningInstances.Reservations![0].Instances![0].Tags!.push({ + Key: 'ghr:orphan', + Value: 'true', + }); + mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances); + await listEC2Runners({ environment: ENVIRONMENT, orphan: true }); + expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, { + Filters: [ + { Name: 'instance-state-name', Values: ['running', 'pending'] }, + { Name: 'tag:ghr:environment', Values: [ENVIRONMENT] }, + { Name: 'tag:ghr:orphan', Values: ['true'] }, + { Name: 'tag:ghr:Application', Values: ['github-action-runner'] }, + ], + }); + }); + it('No instances, undefined reservations list.', async () => { const noInstances: DescribeInstancesResult = { Reservations: undefined, diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 14bbd5b69e..eb4bf53ef6 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -90,6 +90,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { type: i.Tags?.find((e) => e.Key === 'ghr:Type')?.Value as string, repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string, org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string, + orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true', }); } } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index b3c7de9072..5c008dec88 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -5,7 +5,7 @@ import nock from 'nock'; import { RunnerInfo, RunnerList } from '../aws/runners.d'; import * as ghAuth from '../gh-auth/gh-auth'; -import { listEC2Runners, terminateRunner } from './../aws/runners'; +import { listEC2Runners, terminateRunner, tag } from './../aws/runners'; import { githubCache } from './cache'; import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down'; @@ -42,6 +42,7 @@ const mockedAppAuth = mocked(ghAuth.createGithubAppAuth, { shallow: false }); const mockedInstallationAuth = mocked(ghAuth.createGithubInstallationAuth, { shallow: false }); const mockCreateClient = mocked(ghAuth.createOctoClient, { shallow: false }); const mockListRunners = mocked(listEC2Runners); +const mockTagRunners = mocked(tag); const mockTerminateRunners = mocked(terminateRunner); export interface TestData { @@ -172,7 +173,7 @@ describe('Scale down runners', () => { describe.each(runnerTypes)('For %s runners.', (type) => { it('Should not call terminate when no runners online.', async () => { // setup - mockListRunners.mockResolvedValue([]); + mockAwsRunners([]); // act await scaleDown(); @@ -197,7 +198,8 @@ describe('Scale down runners', () => { mockGitHubRunners(runners); mockListRunners.mockResolvedValue(runners); - // act + mockAwsRunners(runners); + await scaleDown(); // assert @@ -220,7 +222,7 @@ describe('Scale down runners', () => { const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES - 1, true, false, false)]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -235,7 +237,7 @@ describe('Scale down runners', () => { const runners = [createRunnerTestData('booting-1', type, MINIMUM_BOOT_TIME - 1, false, false, false)]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -250,7 +252,7 @@ describe('Scale down runners', () => { const runners = [createRunnerTestData('busy-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -274,7 +276,7 @@ describe('Scale down runners', () => { ]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => { return { status: 500 }; }); @@ -293,10 +295,31 @@ describe('Scale down runners', () => { it(`Should terminate orphan.`, async () => { // setup - const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false); + const idleRunner = createRunnerTestData('idle-1', type, MINIMUM_BOOT_TIME + 1, true, false, false); + const runners = [orphanRunner, idleRunner]; - mockGitHubRunners([]); - mockListRunners.mockResolvedValue(runners); + mockGitHubRunners([idleRunner]); + mockAwsRunners(runners); + + // act + await scaleDown(); + + // assert + checkTerminated(runners); + checkNonTerminated(runners); + + expect(mockTagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [ + { + Key: 'orphan', + Value: 'true', + }, + ]); + expect(mockTagRunners).not.toHaveBeenCalledWith(idleRunner.instanceId, expect.anything()); + + // next cycle, update test data set orphan to true and terminate should be true + orphanRunner.orphan = true; + orphanRunner.shouldBeTerminated = true; // act await scaleDown(); @@ -306,21 +329,60 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); - it(`Should orphan termination failure is not resulting in an exception..`, async () => { + it(`Should ignore errors when termination orphan fails.`, async () => { // setup - const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true); + const runners = [orphanRunner]; mockGitHubRunners([]); - mockListRunners.mockResolvedValue(runners); - mockTerminateRunners.mockRejectedValue(new Error('Termination failed')); + mockAwsRunners(runners); + mockTerminateRunners.mockImplementation(() => { + throw new Error('Failed to terminate'); + }); - // act and ensure no exception is thrown - await expect(scaleDown()).resolves.not.toThrow(); + // act + await scaleDown(); + + // assert + checkTerminated(runners); + checkNonTerminated(runners); + }); + + describe('When orphan termination fails', () => { + it(`Should not throw in case of list runner exception.`, async () => { + // setup + const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + + mockGitHubRunners([]); + mockListRunners.mockRejectedValueOnce(new Error('Failed to list runners')); + mockAwsRunners(runners); + + // ac + await scaleDown(); + + // assert + checkNonTerminated(runners); + }); + + it(`Should not throw in case of terminate runner exception.`, async () => { + // setup + const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + + mockGitHubRunners([]); + mockAwsRunners(runners); + mockTerminateRunners.mockRejectedValue(new Error('Failed to terminate')); + + // act and ensure no exception is thrown + await scaleDown(); + + // assert + checkNonTerminated(runners); + }); }); it(`Should not terminate instance in case de-register fails.`, async () => { // setup - const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, true, false)]; + const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)]; mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => { return { status: 500 }; @@ -330,7 +392,7 @@ describe('Scale down runners', () => { }); mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act and should resolve await expect(scaleDown()).resolves.not.toThrow(); @@ -352,7 +414,7 @@ describe('Scale down runners', () => { }); mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await expect(scaleDown()).resolves.not.toThrow(); @@ -383,7 +445,7 @@ describe('Scale down runners', () => { ]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -502,6 +564,12 @@ describe('Scale down runners', () => { }); }); +function mockAwsRunners(runners: RunnerTestItem[]) { + mockListRunners.mockImplementation(async (filter) => { + return runners.filter((r) => !filter?.orphan || filter?.orphan === r.orphan); + }); +} + function checkNonTerminated(runners: RunnerTestItem[]) { const notTerminated = runners.filter((r) => !r.shouldBeTerminated); for (const toTerminate of notTerminated) { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 96b4dd4c76..375e8d5163 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -3,7 +3,7 @@ import { createChildLogger } from '@terraform-aws-github-runner/aws-powertools-u import moment from 'moment'; import { createGithubAppAuth, createGithubInstallationAuth, createOctoClient } from '../gh-auth/gh-auth'; -import { bootTimeExceeded, listEC2Runners, terminateRunner } from './../aws/runners'; +import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners'; import { RunnerInfo, RunnerList } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; @@ -184,8 +184,7 @@ async function evaluateAndRemoveRunners( } } else { if (bootTimeExceeded(ec2Runner)) { - logger.info(`Runner '${ec2Runner.instanceId}' is orphaned and will be removed.`); - terminateOrphan(ec2Runner.instanceId); + markOrphan(ec2Runner.instanceId); } else { logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`); } @@ -194,11 +193,26 @@ async function evaluateAndRemoveRunners( } } -async function terminateOrphan(instanceId: string): Promise { +async function markOrphan(instanceId: string): Promise { try { - await terminateRunner(instanceId); + await tag(instanceId, [{ Key: 'orphan', Value: 'true' }]); + logger.info(`Runner '${instanceId}' marked as orphan.`); } catch (e) { - logger.debug(`Orphan runner '${instanceId}' cannot be removed.`); + logger.warn(`Orphan runner '${instanceId}' cannot be marked.`); + } +} + +async function terminateOrphan(environment: string): Promise { + try { + const orphanRunners = await listEC2Runners({ environment, orphan: true }); + + for (const runner of orphanRunners) { + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } + } catch (e) { + logger.warn(`Failure during orphan runner termination.`, { error: e }); } } @@ -221,14 +235,18 @@ async function listRunners(environment: string) { } function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] { - return ec2runners.filter((ec2Runner) => ec2Runner.type) as RunnerInfo[]; + return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[]; } export async function scaleDown(): Promise { githubCache.reset(); - const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig]; const environment = process.env.ENVIRONMENT; + const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig]; + + // first runners marked to be orphan. + await terminateOrphan(environment); + // next scale down idle runners with respect to config and mark potential orphans const ec2Runners = await listRunners(environment); const activeEc2RunnersCount = ec2Runners.length; logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`); From 3172cb2c017b6fbc62d763e1a02dd17542bef671 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 17 Jul 2024 17:43:35 +0200 Subject: [PATCH 3/8] set converage --- lambdas/functions/control-plane/jest.config.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lambdas/functions/control-plane/jest.config.ts b/lambdas/functions/control-plane/jest.config.ts index 74cc3ff598..dd8ac50414 100644 --- a/lambdas/functions/control-plane/jest.config.ts +++ b/lambdas/functions/control-plane/jest.config.ts @@ -6,10 +6,10 @@ const config: Config = { ...defaultConfig, coverageThreshold: { global: { - statements: 97.99, - branches: 96.04, - functions: 97.53, - lines: 98.27, + statements: 98.01, + branches: 97.28, + functions: 95.6, + lines: 97.94, }, }, }; From d045090569b6ae1dc4fcaa404b6e88c56c0c40fb Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Thu, 1 Aug 2024 10:13:09 +0200 Subject: [PATCH 4/8] add ghr prefix --- lambdas/functions/control-plane/package.json | 2 +- .../src/scale-runners/scale-down.test.ts | 2 +- .../control-plane/src/scale-runners/scale-down.ts | 12 +++++------- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lambdas/functions/control-plane/package.json b/lambdas/functions/control-plane/package.json index 73cca23a32..14e9a1b6a6 100644 --- a/lambdas/functions/control-plane/package.json +++ b/lambdas/functions/control-plane/package.json @@ -8,7 +8,7 @@ "test": "NODE_ENV=test nx test", "test:watch": "NODE_ENV=test nx test --watch", "lint": "yarn eslint src", - "watch": "ts-node-dev --respawn --exit-child --files src/local.ts", + "watch": "ts-node-dev --respawn --exit-child --files src/local-down.ts", "build": "ncc build src/lambda.ts -o dist", "dist": "yarn build && cd dist && zip ../runners.zip index.js", "format": "prettier --write \"**/*.ts\"", diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 5c008dec88..925e4e4b10 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -311,7 +311,7 @@ describe('Scale down runners', () => { expect(mockTagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [ { - Key: 'orphan', + Key: 'ghr:orphan', Value: 'true', }, ]); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 375e8d5163..24663b264f 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -182,12 +182,10 @@ async function evaluateAndRemoveRunners( ); } } + } else if (bootTimeExceeded(ec2Runner)) { + markOrphan(ec2Runner.instanceId); } else { - if (bootTimeExceeded(ec2Runner)) { - markOrphan(ec2Runner.instanceId); - } else { - logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`); - } + logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`); } } } @@ -195,10 +193,10 @@ async function evaluateAndRemoveRunners( async function markOrphan(instanceId: string): Promise { try { - await tag(instanceId, [{ Key: 'orphan', Value: 'true' }]); + await tag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); logger.info(`Runner '${instanceId}' marked as orphan.`); } catch (e) { - logger.warn(`Orphan runner '${instanceId}' cannot be marked.`); + logger.error(`Failed to mark runner '${instanceId}' as orphan.`, { error: e }); } } From 728ff39d2248f7bb0a5a10626758c6185f51689b Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Thu, 1 Aug 2024 15:05:51 +0200 Subject: [PATCH 5/8] clean logs --- lambdas/functions/control-plane/src/aws/runners.ts | 6 +++--- .../control-plane/src/scale-runners/scale-down.ts | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index eb4bf53ef6..346013b0d1 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -100,14 +100,14 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { } export async function terminateRunner(instanceId: string): Promise { - logger.info(`Runner '${instanceId}' will be terminated.`); + logger.debug(`Runner '${instanceId}' will be terminated.`); const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); await ec2.send(new TerminateInstancesCommand({ InstanceIds: [instanceId] })); - logger.info(`Runner ${instanceId} has been terminated.`); + logger.debug(`Runner ${instanceId} has been terminated.`); } export async function tag(instanceId: string, tags: Tag[]): Promise { - logger.info(`Tagging '${instanceId}'`, { tags }); + logger.debug(`Tagging '${instanceId}'`, { tags }); const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); await ec2.send(new CreateTagsCommand({ Resources: [instanceId], Tags: tags })); } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 24663b264f..eeef81d5d5 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -130,7 +130,7 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi if (statuses.every((status) => status == 204)) { await terminateRunner(ec2runner.instanceId); - logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); + logger.debug(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); } else { logger.error(`Failed to de-register GitHub runner: ${statuses}`); } @@ -175,7 +175,7 @@ async function evaluateAndRemoveRunners( idleCounter--; logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`); } else { - logger.info(`Will try to terminate runners that are not busy`); + logger.info(`Terminating all non busy runners.`); await removeRunner( ec2Runner, ghRunnersFiltered.map((runner: { id: number }) => runner.id), @@ -183,7 +183,7 @@ async function evaluateAndRemoveRunners( } } } else if (bootTimeExceeded(ec2Runner)) { - markOrphan(ec2Runner.instanceId); + await markOrphan(ec2Runner.instanceId); } else { logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`); } @@ -205,6 +205,7 @@ async function terminateOrphan(environment: string): Promise { const orphanRunners = await listEC2Runners({ environment, orphan: true }); for (const runner of orphanRunners) { + logger.info(`Terminating orphan runner '${runner.instanceId}'`); await terminateRunner(runner.instanceId).catch((e) => { logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); }); From d524f32fc341db40c7432d5a6b88dfa1f2144abb Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Thu, 1 Aug 2024 15:47:55 +0200 Subject: [PATCH 6/8] IAM policy --- modules/runners/policies/lambda-scale-down.json | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/runners/policies/lambda-scale-down.json b/modules/runners/policies/lambda-scale-down.json index f745ba52bf..0f73aeacf1 100644 --- a/modules/runners/policies/lambda-scale-down.json +++ b/modules/runners/policies/lambda-scale-down.json @@ -14,7 +14,8 @@ { "Effect": "Allow", "Action": [ - "ec2:TerminateInstances" + "ec2:TerminateInstances", + "ec2:CreateTags" ], "Resource": [ "*" @@ -28,14 +29,15 @@ { "Effect": "Allow", "Action": [ - "ec2:TerminateInstances" + "ec2:TerminateInstances", + "ec2:CreateTags" ], "Resource": [ "*" ], "Condition": { "StringEquals": { - "ec2:ResourceTag/Application": "github-action-runner" + "ec2:ResourceTag/gh:environment": "${environment}" } } }, From 8a16834f8d8f84bac2d8328377b04485719ba50a Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Thu, 1 Aug 2024 15:48:08 +0200 Subject: [PATCH 7/8] IAM policy --- modules/runners/scale-down.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/runners/scale-down.tf b/modules/runners/scale-down.tf index c74f88e387..5fc9c02ee0 100644 --- a/modules/runners/scale-down.tf +++ b/modules/runners/scale-down.tf @@ -93,6 +93,7 @@ resource "aws_iam_role_policy" "scale_down" { name = "${var.prefix}-lambda-scale-down-policy" role = aws_iam_role.scale_down.name policy = templatefile("${path.module}/policies/lambda-scale-down.json", { + environment = var.prefix github_app_id_arn = var.github_app_parameters.id.arn github_app_key_base64_arn = var.github_app_parameters.key_base64.arn kms_key_arn = local.kms_key_arn From 847bd13e7e08de33b6cefa9ac3576157d66a3d29 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Thu, 1 Aug 2024 15:48:47 +0200 Subject: [PATCH 8/8] update docs --- docs/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 38d7d8cb95..77a248f737 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,7 +46,7 @@ The "Scale Up Runner" Lambda actively monitors the SQS queue, processing incomin The Lambda first requests a JIT configuration or registration token from GitHub, which is needed later by the runner to register itself. This avoids the case that the EC2 instance, which later in the process will install the agent, needs administration permissions to register the runner. Next, the EC2 spot instance is created via the launch template. The launch template defines the specifications of the required instance and contains a [`user_data`](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html) script. This script will install the required software and configure it. The configuration for the runner is shared via EC2 tags and the parameter store (SSM), from which the user data script will fetch it and delete it once it has been retrieved. Once the user data script is finished, the action runner should be online, and the workflow will start in seconds. -The current method for scaling down runners employs a straightforward approach: at predefined intervals, the Lambda conducts a thorough examination of each runner (instance) to assess its activity. If a runner is found to be idle, it is deregistered from GitHub, and the associated AWS instance is terminated. For ephemeral runners the the instance is terminated immediately after the workflow is finished. To avoid orphaned runners the scale down lambda is active in this cae as well. +The current method for scaling down runners employs a straightforward approach: at predefined intervals, the Lambda conducts a thorough examination of each runner (instance) to assess its activity. If a runner is found to be idle, it is deregistered from GitHub, and the associated AWS instance is terminated. For ephemeral runners the the instance is terminated immediately after the workflow is finished. Instances not registered in GitHub as a runner after a minimal boot time will be marked orphan and removed in a next cycle. To avoid orphaned runners the scale down lambda is active in this cae as well. ### Pool @@ -68,7 +68,7 @@ The AMI cleaner is a lambda that will clean up AMIs that are older than a config > This feature is Beta, changes will not trigger a major release as long in beta. -The Instance Termination Watcher is creating log and optional metrics for termination of instances. Currently only spot termination warnings are watched. See [configuration](configuration/) for more details. +The Instance Termination Watcher is creating log and optional metrics for termination of instances. Currently only spot termination warnings are watched. See [configuration](configuration/) for more details. ### Security