123 changes: 74 additions & 49 deletions lambdas/functions/control-plane/src/aws/runners.ts
@@ -2,13 +2,17 @@ import {
CreateFleetCommand,
CreateFleetResult,
CreateTagsCommand,
DefaultTargetCapacityType,
DescribeInstancesCommand,
DescribeInstancesResult,
EC2Client,
FleetLaunchTemplateOverridesRequest,
Tag,
TerminateInstancesCommand,
StopInstancesCommand,
StartInstancesCommand,
_InstanceType,
Filter,
} from '@aws-sdk/client-ec2';
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
import { getTracedAWSV3Client, tracer } from '@aws-github-runner/aws-powertools-util';
@@ -167,15 +171,12 @@ async function processFleetResult(
);
const errors = fleet.Errors?.flatMap((e) => e.ErrorCode || '') || [];

// Educated guess of errors that would make sense to retry based on the list
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
const scaleErrors = [
'UnfulfillableCapacity',
'MaxSpotInstanceCountExceeded',
'TargetCapacityLimitExceededException',
'RequestLimitExceeded',
'ResourceLimitExceeded',
'MaxSpotInstanceCountExceeded',
'MaxSpotFleetRequestCountExceeded',
'InsufficientInstanceCapacity',
];
@@ -184,7 +185,7 @@ async function processFleetResult(
errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) &&
runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot'
) {
logger.warn(`Create fleet failed, initatiing fall back to on demand instances.`);
logger.warn(`Create fleet failed, initiating fall back to on demand instances.`);
logger.debug('Create fleet failed.', { data: fleet.Errors });
const numberOfInstances = runnerParameters.numberOfRunners - instances.length;
const instancesOnDemand = await createRunner({
Expand Down Expand Up @@ -218,12 +219,10 @@ async function getAmiIdOverride(runnerParameters: Runners.RunnerInputParameters)
return amiIdOverride;
} catch (e) {
logger.debug(
`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}. ` +
'Please ensure that the given parameter exists on this region and contains a valid runner AMI ID',
`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}.`,
{ error: e },
);
throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName},
${e}`);
throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}, ${e}`);
}
}

@@ -244,54 +243,80 @@ async function createInstances(
tags.push({ Key: 'ghr:trace_id', Value: traceId! });
}

let fleet: CreateFleetResult;
try {
// see for spec https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html
const createFleetCommand = new CreateFleetCommand({
LaunchTemplateConfigs: [
{
LaunchTemplateSpecification: {
LaunchTemplateName: runnerParameters.launchTemplateName,
Version: '$Default',
},
Overrides: generateFleetOverrides(
runnerParameters.subnets,
runnerParameters.ec2instanceCriteria.instanceTypes,
amiIdOverride,
),
const createFleetCommand = new CreateFleetCommand({
LaunchTemplateConfigs: [
{
LaunchTemplateSpecification: {
LaunchTemplateName: runnerParameters.launchTemplateName,
Version: '$Default',
},
],
SpotOptions: {
MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice,
AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy,
Overrides: generateFleetOverrides(
runnerParameters.subnets,
runnerParameters.ec2instanceCriteria.instanceTypes,
amiIdOverride,
),
},
TargetCapacitySpecification: {
TotalTargetCapacity: runnerParameters.numberOfRunners,
DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType,
],
SpotOptions: {
MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice,
AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy,
},
TargetCapacitySpecification: {
TotalTargetCapacity: runnerParameters.numberOfRunners,
DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType,
},
TagSpecifications: [
{
ResourceType: 'instance',
Tags: tags,
},
TagSpecifications: [
{
ResourceType: 'instance',
Tags: tags,
},
{
ResourceType: 'volume',
Tags: tags,
},
],
Type: 'instant',
});
fleet = await ec2Client.send(createFleetCommand);
} catch (e) {
logger.warn('Create fleet request failed.', { error: e as Error });
throw e;
}
return fleet;
{
ResourceType: 'volume',
Tags: tags,
},
],
Type: 'instant',
});
return await ec2Client.send(createFleetCommand);
}

// If launchTime is undefined, this will return false
export function bootTimeExceeded(ec2Runner: { launchTime?: Date }): boolean {
const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;
const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes');
return launchTimePlusBootTime < moment(new Date()).utc();
}

// New: Hibernate the runner instance
export async function hibernateRunner(instanceId: string): Promise<void> {
logger.debug(`Runner '${instanceId}' will be hibernated (stopped with hibernation).`);
const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
// Note: Stopping an instance that supports hibernation and has hibernation configured will hibernate it.
await ec2.send(new StopInstancesCommand({ InstanceIds: [instanceId], Hibernate: true }));
await tag(instanceId, [{ Key: 'ghr:hibernated', Value: 'true' }]);
logger.debug(`Runner ${instanceId} has been hibernated.`);
}

// Helper function to resume hibernated instances
export async function resumeHibernatedInstances(count: number): Promise<string[]> {
const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
const filters: Filter[] = [
{ Name: 'instance-state-name', Values: ['stopped'] },
{ Name: 'tag:ghr:hibernated', Values: ['true'] },
{ Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
];

const desc = await ec2.send(new DescribeInstancesCommand({ Filters: filters }));
const stoppedInstances = desc.Reservations?.flatMap((r) => r.Instances?.map((i) => i.InstanceId!) || []) || [];

const toResume = stoppedInstances.slice(0, count);
if (toResume.length > 0) {
logger.info(`Resuming hibernated instances: ${toResume.join(',')}`);
await ec2.send(new StartInstancesCommand({ InstanceIds: toResume }));
// Flip the hibernated tag back to 'false' after start so the instance is not picked up for resume again
for (const instanceId of toResume) {
await tag(instanceId, [{ Key: 'ghr:hibernated', Value: 'false' }]);
}
}

return toResume;
}
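The two new helpers above only cover the EC2 side; nothing in this file decides when to resume a hibernated runner. A minimal sketch of how a scale-up path could prefer the hibernated pool before creating new capacity is shown below. The function name, import path, and the wiring into scale-up are assumptions for illustration; only `createRunner` and `resumeHibernatedInstances` come from this file.

```typescript
// Illustrative sketch only (not part of this PR): prefer resuming hibernated
// runners before creating fresh instances. The import path and the name
// scaleUpWithHibernationPool are assumptions.
import { createRunner, resumeHibernatedInstances } from './aws/runners';

export async function scaleUpWithHibernationPool(
  runnerParameters: Parameters<typeof createRunner>[0],
): Promise<string[]> {
  // Satisfy as much of the demand as possible from the hibernated pool first.
  const resumed = await resumeHibernatedInstances(runnerParameters.numberOfRunners);
  const remaining = runnerParameters.numberOfRunners - resumed.length;

  // Fall back to creating new instances for any capacity that is still missing.
  const created =
    remaining > 0 ? await createRunner({ ...runnerParameters, numberOfRunners: remaining }) : [];

  return [...resumed, ...created];
}
```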
106 changes: 65 additions & 41 deletions lambdas/functions/control-plane/src/scale-runners/scale-down.ts
@@ -3,11 +3,12 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
import moment from 'moment';

import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth';
import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners';
import { RunnerInfo, RunnerList } from './../aws/runners.d';
import { bootTimeExceeded, hibernateRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners';
import { RunnerInfo } from './../aws/runners.d';
import { GhRunners, githubCache } from './cache';
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
import { metricGitHubAppRateLimit } from '../github/rate-limit';
import yn from 'yn';

const logger = createChildLogger('scale-down');

@@ -103,39 +104,65 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
return launchTimePlusMinimum < now;
}

async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[], ephemeral: boolean): Promise<void> {
const githubAppClient = await getOrCreateOctokit(ec2runner);
try {
const states = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
// Get busy state instead of using the output of listGitHubRunners(...) to minimize the race condition.
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
}),
);

if (states.every((busy) => busy === false)) {
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);

if (statuses.every((status) => status == 204)) {
await terminateRunner(ec2runner.instanceId);
logger.debug(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
// If ephemeral, still terminate. If not ephemeral, hibernate.
if (ephemeral) {
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);

if (statuses.every((status) => status == 204)) {
await terminateRunner(ec2runner.instanceId);
logger.debug(`AWS runner instance '${ec2runner.instanceId}' terminated (ephemeral runner).`);
} else {
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
}
} else {
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
// Non-ephemeral runner: Hibernate
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);
if (statuses.every((status) => status == 204)) {
await hibernateRunner(ec2runner.instanceId);
logger.debug(`AWS runner instance '${ec2runner.instanceId}' is hibernated (non-ephemeral runner).`);
} else {
logger.error(`Failed to de-register GitHub runner before hibernation: ${statuses}`);
}
}
} else {
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
@@ -155,6 +182,8 @@ async function evaluateAndRemoveRunners(
const evictionStrategy = getEvictionStrategy(scaleDownConfigs);
const ownerTags = new Set(ec2Runners.map((runner) => runner.owner));

const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false });

for (const ownerTag of ownerTags) {
const ec2RunnersFiltered = ec2Runners
.filter((runner) => runner.owner === ownerTag)
@@ -172,16 +201,19 @@
logger.debug(
`GitHub runners for AWS runner instance: '${ec2Runner.instanceId}': ${JSON.stringify(ghRunnersFiltered)}`,
);

if (ghRunnersFiltered.length) {
if (runnerMinimumTimeExceeded(ec2Runner)) {
if (idleCounter > 0) {
idleCounter--;
logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`);
} else {
logger.info(`Terminating all non busy runners.`);
logger.info(`Terminating or hibernating non-busy runners.`);
await removeRunner(
ec2Runner,
ghRunnersFiltered.map((runner: { id: number }) => runner.id),
ephemeralEnabled,
);
}
}
@@ -230,26 +262,18 @@ export function newestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number {
return oldestFirstStrategy(a, b) * -1;
}

async function listRunners(environment: string) {
return await listEC2Runners({
environment,
});
}

function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[];
}

export async function scaleDown(): Promise<void> {
githubCache.reset();
const environment = process.env.ENVIRONMENT;
const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig];

// first runners marked to be orphan.
// first terminate orphan runners
await terminateOrphan(environment);

// next scale down idle runners with respect to config and mark potential orphans
const ec2Runners = await listRunners(environment);
const ec2Runners = await listEC2Runners({
environment,
});
const activeEc2RunnersCount = ec2Runners.length;
logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);
logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(ec2Runners)}`);
@@ -259,9 +283,9 @@ export async function scaleDown(): Promise<void> {
return;
}

const runners = filterRunners(ec2Runners);
const runners = ec2Runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan);
await evaluateAndRemoveRunners(runners, scaleDownConfigs);

const activeEc2RunnersCountAfter = (await listRunners(environment)).length;
const activeEc2RunnersCountAfter = (await listEC2Runners({ environment })).length;
logger.info(`Found: '${activeEc2RunnersCountAfter}' active GitHub EC2 runner instances after clean-up.`);
}
}
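Taken together, the scale-down changes reduce to a per-runner decision once the minimum runtime has passed: keep the runner while the idle pool has room, otherwise de-register the non-busy runner and either terminate it (ephemeral) or hibernate it (non-ephemeral). A compact sketch of that decision follows; the `decideIdleAction` helper and `IdleAction` type are illustrative only and do not exist in this PR.

```typescript
// Illustrative sketch only: decideIdleAction and IdleAction are not part of
// this PR; they restate the branching added in evaluateAndRemoveRunners/removeRunner.
import yn from 'yn';

type IdleAction = 'keep-idle' | 'terminate' | 'hibernate';

export function decideIdleAction(idleCounter: number): IdleAction {
  const ephemeral = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false });
  if (idleCounter > 0) {
    // The idle pool still has room, so this runner stays registered and running.
    return 'keep-idle';
  }
  // Ephemeral runners are single-use: de-register and terminate.
  // Non-ephemeral runners are de-registered and hibernated for later reuse.
  return ephemeral ? 'terminate' : 'hibernate';
}
```

With ENABLE_EPHEMERAL_RUNNERS unset, `decideIdleAction(0)` returns 'hibernate', which is the new default for idle, non-busy runners that have exceeded their minimum runtime.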