Skip to content

Commit 8e00cd9

Browse files
authored
Add hibernation
1 parent 866a1e7 commit 8e00cd9

File tree

3 files changed

+310
-302
lines changed

3 files changed

+310
-302
lines changed

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 74 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@ import {
22
CreateFleetCommand,
33
CreateFleetResult,
44
CreateTagsCommand,
5+
DefaultTargetCapacityType,
56
DescribeInstancesCommand,
67
DescribeInstancesResult,
78
EC2Client,
89
FleetLaunchTemplateOverridesRequest,
910
Tag,
1011
TerminateInstancesCommand,
12+
StopInstancesCommand,
13+
StartInstancesCommand,
1114
_InstanceType,
15+
Filter,
1216
} from '@aws-sdk/client-ec2';
1317
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
1418
import { getTracedAWSV3Client, tracer } from '@aws-github-runner/aws-powertools-util';
@@ -167,15 +171,12 @@ async function processFleetResult(
167171
);
168172
const errors = fleet.Errors?.flatMap((e) => e.ErrorCode || '') || [];
169173

170-
// Educated guess of errors that would make sense to retry based on the list
171-
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
172174
const scaleErrors = [
173175
'UnfulfillableCapacity',
174176
'MaxSpotInstanceCountExceeded',
175177
'TargetCapacityLimitExceededException',
176178
'RequestLimitExceeded',
177179
'ResourceLimitExceeded',
178-
'MaxSpotInstanceCountExceeded',
179180
'MaxSpotFleetRequestCountExceeded',
180181
'InsufficientInstanceCapacity',
181182
];
@@ -184,7 +185,7 @@ async function processFleetResult(
184185
errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) &&
185186
runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot'
186187
) {
187-
logger.warn(`Create fleet failed, initatiing fall back to on demand instances.`);
188+
logger.warn(`Create fleet failed, initiating fall back to on demand instances.`);
188189
logger.debug('Create fleet failed.', { data: fleet.Errors });
189190
const numberOfInstances = runnerParameters.numberOfRunners - instances.length;
190191
const instancesOnDemand = await createRunner({
@@ -218,12 +219,10 @@ async function getAmiIdOverride(runnerParameters: Runners.RunnerInputParameters)
218219
return amiIdOverride;
219220
} catch (e) {
220221
logger.debug(
221-
`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}. ` +
222-
'Please ensure that the given parameter exists on this region and contains a valid runner AMI ID',
222+
`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}.`,
223223
{ error: e },
224224
);
225-
throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName},
226-
${e}`);
225+
throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}, ${e}`);
227226
}
228227
}
229228

@@ -244,54 +243,80 @@ async function createInstances(
244243
tags.push({ Key: 'ghr:trace_id', Value: traceId! });
245244
}
246245

247-
let fleet: CreateFleetResult;
248-
try {
249-
// see for spec https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html
250-
const createFleetCommand = new CreateFleetCommand({
251-
LaunchTemplateConfigs: [
252-
{
253-
LaunchTemplateSpecification: {
254-
LaunchTemplateName: runnerParameters.launchTemplateName,
255-
Version: '$Default',
256-
},
257-
Overrides: generateFleetOverrides(
258-
runnerParameters.subnets,
259-
runnerParameters.ec2instanceCriteria.instanceTypes,
260-
amiIdOverride,
261-
),
246+
const createFleetCommand = new CreateFleetCommand({
247+
LaunchTemplateConfigs: [
248+
{
249+
LaunchTemplateSpecification: {
250+
LaunchTemplateName: runnerParameters.launchTemplateName,
251+
Version: '$Default',
262252
},
263-
],
264-
SpotOptions: {
265-
MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice,
266-
AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy,
253+
Overrides: generateFleetOverrides(
254+
runnerParameters.subnets,
255+
runnerParameters.ec2instanceCriteria.instanceTypes,
256+
amiIdOverride,
257+
),
267258
},
268-
TargetCapacitySpecification: {
269-
TotalTargetCapacity: runnerParameters.numberOfRunners,
270-
DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType,
259+
],
260+
SpotOptions: {
261+
MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice,
262+
AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy,
263+
},
264+
TargetCapacitySpecification: {
265+
TotalTargetCapacity: runnerParameters.numberOfRunners,
266+
DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType,
267+
},
268+
TagSpecifications: [
269+
{
270+
ResourceType: 'instance',
271+
Tags: tags,
271272
},
272-
TagSpecifications: [
273-
{
274-
ResourceType: 'instance',
275-
Tags: tags,
276-
},
277-
{
278-
ResourceType: 'volume',
279-
Tags: tags,
280-
},
281-
],
282-
Type: 'instant',
283-
});
284-
fleet = await ec2Client.send(createFleetCommand);
285-
} catch (e) {
286-
logger.warn('Create fleet request failed.', { error: e as Error });
287-
throw e;
288-
}
289-
return fleet;
273+
{
274+
ResourceType: 'volume',
275+
Tags: tags,
276+
},
277+
],
278+
Type: 'instant',
279+
});
280+
return await ec2Client.send(createFleetCommand);
290281
}
291282

292-
// If launchTime is undefined, this will return false
293283
/**
 * Reports whether a runner has been up for longer than the configured boot allowance.
 *
 * The allowance is read from the `RUNNER_BOOT_TIME_IN_MINUTES` environment variable, added to the
 * runner's launch time, and the resulting deadline is compared against the current time.
 *
 * NOTE(review): when `launchTime` is undefined, moment presumably falls back to the current time,
 * which would make this return `false` for a positive allowance — confirm.
 *
 * @param ec2Runner - Object carrying the runner's optional `launchTime`.
 * @returns `true` when launch time plus the boot allowance lies before now, otherwise `false`.
 */
export function bootTimeExceeded(ec2Runner: { launchTime?: Date }): boolean {
  const { launchTime } = ec2Runner;
  const bootMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;

  // Deadline by which the runner is expected to have finished booting.
  const bootDeadline = moment(launchTime).utc().add(bootMinutes, 'minutes');
  const now = moment(new Date()).utc();

  return bootDeadline < now;
}
288+
289+
/**
 * Stops an EC2 runner instance with hibernation requested and marks it with the
 * `ghr:hibernated=true` tag.
 *
 * NOTE(review): EC2 presumably only honours the hibernate request for instances that support
 * hibernation and have it configured — confirm against the launch template.
 *
 * @param instanceId - ID of the EC2 instance to hibernate.
 * @returns A promise that settles once the stop request and the tagging call have completed.
 */
export async function hibernateRunner(instanceId: string): Promise<void> {
  logger.debug(`Runner '${instanceId}' will be hibernated (stopped with hibernation).`);

  const client = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));

  // `Hibernate: true` asks EC2 to hibernate the instance instead of performing a plain stop.
  const stopCommand = new StopInstancesCommand({ InstanceIds: [instanceId], Hibernate: true });
  await client.send(stopCommand);

  // Mark the instance only after the stop request went through, so it can be found again for resuming.
  const hibernatedTag = { Key: 'ghr:hibernated', Value: 'true' };
  await tag(instanceId, [hibernatedTag]);

  logger.debug(`Runner ${instanceId} has been hibernated.`);
}
298+
299+
/**
 * Starts up to `count` stopped instances that were previously marked as hibernated.
 *
 * Looks up instances in the `stopped` state that carry the tags `ghr:hibernated=true` and
 * `ghr:Application=github-action-runner`, requests a start for at most `count` of them and then
 * sets their `ghr:hibernated` tag to `false`.
 *
 * NOTE(review): the lookup is not scoped to an environment or runner owner and only reads the first
 * page of the describe result — confirm this is acceptable for callers.
 *
 * @param count - Maximum number of instances to resume. Values below 1 (and `NaN`) resume nothing.
 * @returns The IDs of the instances for which a start was requested; may be fewer than `count`.
 */
export async function resumeHibernatedInstances(count: number): Promise<string[]> {
  // A negative `count` would make `slice(0, count)` drop elements from the END of the list and
  // resume almost every hibernated instance, so reject anything below 1 before calling AWS.
  if (Number.isNaN(count) || count < 1) {
    return [];
  }

  const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
  const filters: Filter[] = [
    { Name: 'instance-state-name', Values: ['stopped'] },
    { Name: 'tag:ghr:hibernated', Values: ['true'] },
    { Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
  ];

  const described = await ec2.send(new DescribeInstancesCommand({ Filters: filters }));

  // `InstanceId` is optional in the SDK types; drop missing IDs instead of asserting them away.
  const hibernatedIds = (described.Reservations ?? [])
    .flatMap((reservation) => reservation.Instances ?? [])
    .map((instance) => instance.InstanceId)
    .filter((id): id is string => id !== undefined);

  const toResume = hibernatedIds.slice(0, count);
  if (toResume.length === 0) {
    return toResume;
  }

  logger.info(`Resuming hibernated instances: ${toResume.join(',')}`);
  await ec2.send(new StartInstancesCommand({ InstanceIds: toResume }));

  // Flip the marker tag to 'false' (the tag is kept, not removed) so these instances are no longer
  // picked up by the filter above. The tagging calls are independent, so run them concurrently.
  await Promise.all(
    toResume.map((instanceId) => tag(instanceId, [{ Key: 'ghr:hibernated', Value: 'false' }])),
  );

  return toResume;
}

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 65 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
33
import moment from 'moment';
44

55
import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth';
6-
import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners';
7-
import { RunnerInfo, RunnerList } from './../aws/runners.d';
6+
import { bootTimeExceeded, hibernateRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners';
7+
import { RunnerInfo } from './../aws/runners.d';
88
import { GhRunners, githubCache } from './cache';
99
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
1010
import { metricGitHubAppRateLimit } from '../github/rate-limit';
11+
import yn from 'yn';
1112

1213
const logger = createChildLogger('scale-down');
1314

@@ -103,39 +104,65 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
103104
return launchTimePlusMinimum < now;
104105
}
105106

106-
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
107+
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[], ephemeral: boolean): Promise<void> {
107108
const githubAppClient = await getOrCreateOctokit(ec2runner);
108109
try {
109110
const states = await Promise.all(
110111
ghRunnerIds.map(async (ghRunnerId) => {
111-
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
112112
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
113113
}),
114114
);
115115

116116
if (states.every((busy) => busy === false)) {
117-
const statuses = await Promise.all(
118-
ghRunnerIds.map(async (ghRunnerId) => {
119-
return (
120-
ec2runner.type === 'Org'
121-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
122-
runner_id: ghRunnerId,
123-
org: ec2runner.owner,
124-
})
125-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
126-
runner_id: ghRunnerId,
127-
owner: ec2runner.owner.split('/')[0],
128-
repo: ec2runner.owner.split('/')[1],
129-
})
130-
).status;
131-
}),
132-
);
133-
134-
if (statuses.every((status) => status == 204)) {
135-
await terminateRunner(ec2runner.instanceId);
136-
logger.debug(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
117+
// If ephemeral, still terminate. If not ephemeral, hibernate.
118+
if (ephemeral) {
119+
const statuses = await Promise.all(
120+
ghRunnerIds.map(async (ghRunnerId) => {
121+
return (
122+
ec2runner.type === 'Org'
123+
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
124+
runner_id: ghRunnerId,
125+
org: ec2runner.owner,
126+
})
127+
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
128+
runner_id: ghRunnerId,
129+
owner: ec2runner.owner.split('/')[0],
130+
repo: ec2runner.owner.split('/')[1],
131+
})
132+
).status;
133+
}),
134+
);
135+
136+
if (statuses.every((status) => status == 204)) {
137+
await terminateRunner(ec2runner.instanceId);
138+
logger.debug(`AWS runner instance '${ec2runner.instanceId}' terminated (ephemeral runner).`);
139+
} else {
140+
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
141+
}
137142
} else {
138-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
143+
// Non-ephemeral runner: Hibernate
144+
const statuses = await Promise.all(
145+
ghRunnerIds.map(async (ghRunnerId) => {
146+
return (
147+
ec2runner.type === 'Org'
148+
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
149+
runner_id: ghRunnerId,
150+
org: ec2runner.owner,
151+
})
152+
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
153+
runner_id: ghRunnerId,
154+
owner: ec2runner.owner.split('/')[0],
155+
repo: ec2runner.owner.split('/')[1],
156+
})
157+
).status;
158+
}),
159+
);
160+
if (statuses.every((status) => status == 204)) {
161+
await hibernateRunner(ec2runner.instanceId);
162+
logger.debug(`AWS runner instance '${ec2runner.instanceId}' is hibernated (non-ephemeral runner).`);
163+
} else {
164+
logger.error(`Failed to de-register GitHub runner before hibernation: ${statuses}`);
165+
}
139166
}
140167
} else {
141168
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
@@ -155,6 +182,8 @@ async function evaluateAndRemoveRunners(
155182
const evictionStrategy = getEvictionStrategy(scaleDownConfigs);
156183
const ownerTags = new Set(ec2Runners.map((runner) => runner.owner));
157184

185+
const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false });
186+
158187
for (const ownerTag of ownerTags) {
159188
const ec2RunnersFiltered = ec2Runners
160189
.filter((runner) => runner.owner === ownerTag)
@@ -172,16 +201,19 @@ async function evaluateAndRemoveRunners(
172201
logger.debug(
173202
`GitHub runners for AWS runner instance: '${ec2Runner.instanceId}': ${JSON.stringify(ghRunnersFiltered)}`,
174203
);
204+
const ephemeral = ephemeralEnabled && process.env.ENABLE_EPHEMERAL_RUNNERS === 'true';
205+
175206
if (ghRunnersFiltered.length) {
176207
if (runnerMinimumTimeExceeded(ec2Runner)) {
177208
if (idleCounter > 0) {
178209
idleCounter--;
179210
logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`);
180211
} else {
181-
logger.info(`Terminating all non busy runners.`);
212+
logger.info(`Terminating or hibernating non busy runners.`);
182213
await removeRunner(
183214
ec2Runner,
184215
ghRunnersFiltered.map((runner: { id: number }) => runner.id),
216+
ephemeral,
185217
);
186218
}
187219
}
@@ -230,26 +262,18 @@ export function newestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number {
230262
return oldestFirstStrategy(a, b) * -1;
231263
}
232264

233-
async function listRunners(environment: string) {
234-
return await listEC2Runners({
235-
environment,
236-
});
237-
}
238-
239-
function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
240-
return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[];
241-
}
242-
243265
export async function scaleDown(): Promise<void> {
244266
githubCache.reset();
245267
const environment = process.env.ENVIRONMENT;
246268
const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig];
247269

248-
// first runners marked to be orphan.
270+
// first terminate orphan runners
249271
await terminateOrphan(environment);
250272

251273
// next scale down idle runners with respect to config and mark potential orphans
252-
const ec2Runners = await listRunners(environment);
274+
const ec2Runners = await listEC2Runners({
275+
environment,
276+
});
253277
const activeEc2RunnersCount = ec2Runners.length;
254278
logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);
255279
logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(ec2Runners)}`);
@@ -259,9 +283,9 @@ export async function scaleDown(): Promise<void> {
259283
return;
260284
}
261285

262-
const runners = filterRunners(ec2Runners);
286+
const runners = ec2Runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan);
263287
await evaluateAndRemoveRunners(runners, scaleDownConfigs);
264288

265-
const activeEc2RunnersCountAfter = (await listRunners(environment)).length;
289+
const activeEc2RunnersCountAfter = (await listEC2Runners({ environment })).length;
266290
logger.info(`Found: '${activeEc2RunnersCountAfter}' active GitHub EC2 runners instances after clean-up.`);
267-
}
291+
}

0 commit comments

Comments
 (0)