Skip to content

Commit da2dd95

Browse files
committed
runners: Add scaleCycle lambda to reuse runners
This lambda attempts to reuse runners that have finished their jobs and are sitting idle. The plan is to run it in AWS on a cron schedule. The functionality in this lambda will eventually replace the tryReuseRunner function in scale-up.ts. Signed-off-by: Eli Uriegas <[email protected]> ghstack-source-id: bc13e21 ghstack-comment-id: 3046547816 Pull-Request: #6892
1 parent 12bc70b commit da2dd95

File tree

4 files changed

+136
-2
lines changed

4 files changed

+136
-2
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@ import {
77
ScaleUpChronMetrics,
88
sendMetricsAtTimeout,
99
sendMetricsTimeoutVars,
10+
ScaleCycleMetrics,
1011
} from './scale-runners/metrics';
1112
import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils';
1213
import { scaleDown as scaleDownR } from './scale-runners/scale-down';
1314
import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron';
1415
import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs';
15-
16+
import { scaleCycle as scaleCycleR } from './scale-runners/scale-cycle';
17+
1618
async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) {
1719
console.error(`Detected ${evtFailed.length} errors when processing messages, will retry relevant messages.`);
1820
metrics.exception();
@@ -202,3 +204,37 @@ export async function scaleUpChron(event: ScheduledEvent, context: Context, call
202204
}
203205
callback(callbackOutput);
204206
}
207+
208+
/**
 * Lambda entry point for the scheduled scale-cycle run.
 *
 * Creates a fresh ScaleCycleMetrics instance, arms a timer that fires the
 * function returned by `sendMetricsAtTimeout` shortly before the configured
 * lambda timeout, runs the scale-cycle logic and finally sends the metrics.
 *
 * @param event - The scheduled event that triggered the lambda (not read here).
 * @param context - Lambda context; `callbackWaitsForEmptyEventLoop` is disabled on it.
 * @param callback - Invoked exactly once with `null` on success, or with a
 *   string describing every failure that occurred (scale-cycle failure and/or
 *   metrics-send failure).
 */
export async function scaleCycle(event: ScheduledEvent, context: Context, callback: any) {
  // We maintain open connections to redis, so the event loop is only emptied when SIGTERM is sent;
  // do not make the runtime wait for it before returning.
  context.callbackWaitsForEmptyEventLoop = false;

  const metrics = new ScaleCycleMetrics();
  const sndMetricsTimout: sendMetricsTimeoutVars = {
    metrics: metrics,
  };
  // Fire 10 units before `lambdaTimeout` (presumably seconds, given the `* 1000` conversion to ms).
  sndMetricsTimout.setTimeout = setTimeout(
    sendMetricsAtTimeout(sndMetricsTimout),
    (Config.Instance.lambdaTimeout - 10) * 1000,
  );

  let callbackOutput: string | null = null;

  try {
    await scaleCycleR(metrics);
  } catch (e) {
    console.error(e);
    callbackOutput = `Failed to scale cycle: ${e}`;
  } finally {
    try {
      // Disarm the timeout handler before sending metrics ourselves.
      clearTimeout(sndMetricsTimout.setTimeout);
      sndMetricsTimout.metrics = undefined;
      sndMetricsTimout.setTimeout = undefined;
      await metrics.sendMetrics();
    } catch (e) {
      console.error(e);
      const metricsError = `Error sending metrics: ${e}`;
      // Keep the primary failure visible instead of overwriting it with the metrics failure.
      callbackOutput = callbackOutput === null ? metricsError : `${callbackOutput}; ${metricsError}`;
    }
  }

  callback(callbackOutput);
}

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,3 +1813,24 @@ export function sendMetricsAtTimeout(metricsTimeouts: sendMetricsTimeoutVars) {
18131813
}
18141814
};
18151815
}
1816+
1817+
/**
 * Metrics collector for the scaleCycle lambda.
 *
 * Extends ScaleUpMetrics (constructed with the name 'scaleCycle') and adds
 * counters recorded when a reusable runner is found, with dimensions for the
 * runner type and, optionally, the org or repo.
 */
export class ScaleCycleMetrics extends ScaleUpMetrics {
  constructor() {
    super('scaleCycle');
  }

  /** Counts one found reusable runner, dimensioned by runner type only. */
  scaleCycleRunnerReuseFound(runnerType: string) {
    const dims = new Map<string, string>();
    dims.set('RunnerType', runnerType);
    this.countEntry('run.scaleCycle.runnerReuse.found', 1, dims);
  }

  /** Counts one found reusable runner, dimensioned by org and runner type. */
  scaleCycleRunnerReuseFoundOrg(org: string, runnerType: string) {
    const dims = new Map<string, string>();
    dims.set('Org', org);
    dims.set('RunnerType', runnerType);
    this.countEntry('run.scaleCycle.runnerReuse.found.org', 1, dims);
  }

  /** Counts one found reusable runner, dimensioned by repo and runner type. */
  scaleCycleRunnerReuseFoundRepo(repo: string, runnerType: string) {
    const dims = new Map<string, string>();
    dims.set('Repo', repo);
    dims.set('RunnerType', runnerType);
    this.countEntry('run.scaleCycle.runnerReuse.found.repo', 1, dims);
  }
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import { Config } from './config';
2+
import { listRunners, RunnerInputParameters, RunnerType, tryReuseRunner } from './runners';
3+
import { getRepo, getRepoKey, Repo } from './utils';
4+
import { ScaleCycleMetrics } from './metrics';
5+
import { getRunnerTypes } from './gh-runners';
6+
import { createRunnerConfigArgument } from './scale-up';
7+
8+
/**
 * Runs one scale cycle.
 *
 * Loads the runner type definitions from the scale-config repository, lists
 * the runners of every known type that carry the 'GithubRunnerID',
 * 'EphemeralRunnerFinished' and 'RunnerType' tags, and for each listed runner
 * records a "reuse found" metric and calls `tryReuseRunner` with parameters
 * derived from that runner's type, org and repo. Runners are processed one at
 * a time, in listing order.
 *
 * @param metrics - Metrics collector passed to every helper and used for the reuse counters.
 */
export async function scaleCycle(metrics: ScaleCycleMetrics) {
  // Runner type configuration comes first; everything else is keyed on it.
  const configRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo);
  const runnerTypes = await getRunnerTypes(configRepo, metrics);

  // One listRunners call per configured runner type, so the filtering happens on the listing side.
  const listingsPerType = Array.from(runnerTypes.keys()).map((typeName) =>
    listRunners(metrics, {
      containsTags: ['GithubRunnerID', 'EphemeralRunnerFinished', 'RunnerType'],
      runnerType: typeName,
    }),
  );
  const candidates = (await Promise.all(listingsPerType)).flat();

  for (const candidate of candidates) {
    const { runnerType: typeName, org, repo: repoName } = candidate;

    // The runner type, org and repo must all be present on the runner.
    if (!(typeName && org && repoName)) {
      console.warn(`Skipping runner ${candidate.instanceId} due to missing required tags`);
      continue;
    }

    // Resolve the type name back to its RunnerType definition.
    const typeDefinition = runnerTypes.get(typeName);
    if (!typeDefinition) {
      console.warn(`Unknown runner type: ${typeName}, skipping`);
      continue;
    }

    const ownerRepo = getRepo(org, repoName);

    // Parameters handed to tryReuseRunner for this runner.
    const reuseParams: RunnerInputParameters = {
      runnerConfig: (awsRegion: string, experimentalRunner: boolean) =>
        createRunnerConfigArgument(
          typeDefinition,
          ownerRepo,
          // NOTE: installationId can actually be undefined here but this may incur lower rate limits
          // TODO: figure out if we need to pass an actual installationId here
          undefined,
          metrics,
          awsRegion,
          experimentalRunner,
        ),
      environment: Config.Instance.environment,
      runnerType: typeDefinition,
    };

    // Scope the reuse to the org or to the repo, depending on configuration.
    if (Config.Instance.enableOrganizationRunners) {
      reuseParams.orgName = org;
      metrics.scaleCycleRunnerReuseFoundOrg(org, typeName);
      console.info(`Reusing runner ${candidate.instanceId} for ${org}`);
    } else {
      const repoKey = getRepoKey(ownerRepo);
      reuseParams.repoName = repoKey;
      metrics.scaleCycleRunnerReuseFoundRepo(repoKey, typeName);
      console.info(`Reusing runner ${candidate.instanceId} for ${repoKey}`);
    }

    await tryReuseRunner(reuseParams, metrics);
  }
}

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ export async function scaleUp(
163163
}
164164
}
165165

166-
async function createRunnerConfigArgument(
166+
export async function createRunnerConfigArgument(
167167
runnerType: RunnerType,
168168
repo: Repo,
169169
installationId: number | undefined,

0 commit comments

Comments
 (0)