Skip to content

Commit 4556a13

Browse files
authored
runners: More scale-down perf improvements (#6854)
Does the following: * Removes ssm parameter cleanup from terminateInstances (will be a follow-up PR to add a termination policy to parameters) * Removed double check for ghRunner calls (was causing performance bottlenecks * NOTE: We will need to monitor removeGHRunnerOrg calls to see if those introduce another performance bottleneck + job cancellations (if they rise then we revert, dashboard: https://hud.pytorch.org/job_cancellation_dashboard) Signed-off-by: Eli Uriegas <[email protected]> --------- Signed-off-by: Eli Uriegas <[email protected]>
1 parent 003bee0 commit 4556a13

File tree

5 files changed

+35
-165
lines changed

5 files changed

+35
-165
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -355,20 +355,12 @@ describe('terminateRunners', () => {
355355
},
356356
];
357357

358-
mockSSMdescribeParametersRet.mockResolvedValueOnce({
359-
Parameters: runners
360-
.map((runner) => getParameterNameForRunner(runner.environment as string, runner.instanceId))
361-
.map((s) => ({ Name: s })),
362-
});
363-
364358
await terminateRunners(runners, metrics);
365359

366360
expect(mockEC2.terminateInstances).toBeCalledTimes(1);
367361
expect(mockEC2.terminateInstances).toBeCalledWith({
368362
InstanceIds: ['i-1234', 'i-5678'],
369363
});
370-
expect(mockSSM.describeParameters).toBeCalledTimes(1);
371-
expect(mockSSM.deleteParameter).toBeCalledTimes(2);
372364
});
373365

374366
it('terminates runners across multiple regions', async () => {
@@ -385,10 +377,6 @@ describe('terminateRunners', () => {
385377
},
386378
];
387379

388-
mockSSMdescribeParametersRet.mockResolvedValue({
389-
Parameters: [{ Name: 'gi-ci-i-1234' }, { Name: 'gi-ci-i-5678' }],
390-
});
391-
392380
await terminateRunners(runners, metrics);
393381

394382
expect(mockEC2.terminateInstances).toBeCalledTimes(2);
@@ -398,8 +386,6 @@ describe('terminateRunners', () => {
398386
expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(2, {
399387
InstanceIds: ['i-5678'],
400388
});
401-
expect(mockSSM.describeParameters).toBeCalledTimes(2);
402-
expect(mockSSM.deleteParameter).toBeCalledTimes(2);
403389
});
404390

405391
it('handles partial failure - terminates some runners but fails on others', async () => {
@@ -421,16 +407,6 @@ describe('terminateRunners', () => {
421407
},
422408
];
423409

424-
// First region succeeds
425-
mockSSMdescribeParametersRet.mockResolvedValueOnce({
426-
Parameters: [{ Name: 'gi-ci-i-1234' }, { Name: 'gi-ci-i-5678' }],
427-
});
428-
429-
// Second region also gets SSM parameters but has no successful terminations to clean up
430-
mockSSMdescribeParametersRet.mockResolvedValueOnce({
431-
Parameters: [],
432-
});
433-
434410
// First region succeeds, second region fails
435411
mockEC2.terminateInstances
436412
.mockReturnValueOnce({
@@ -445,8 +421,6 @@ describe('terminateRunners', () => {
445421
);
446422

447423
expect(mockEC2.terminateInstances).toBeCalledTimes(2);
448-
expect(mockSSM.describeParameters).toBeCalledTimes(2); // Called for both regions
449-
expect(mockSSM.deleteParameter).toBeCalledTimes(2); // Only for successful region
450424
});
451425

452426
it('handles large batches by splitting into chunks', async () => {
@@ -457,12 +431,6 @@ describe('terminateRunners', () => {
457431
environment: 'gi-ci',
458432
}));
459433

460-
mockSSMdescribeParametersRet.mockResolvedValueOnce({
461-
Parameters: runners.map((runner) => ({
462-
Name: getParameterNameForRunner(runner.environment as string, runner.instanceId),
463-
})),
464-
});
465-
466434
await terminateRunners(runners, metrics);
467435

468436
// Should make 2 terminate calls (batches of 100 and 50)
@@ -473,10 +441,6 @@ describe('terminateRunners', () => {
473441
expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(2, {
474442
InstanceIds: runners.slice(100, 150).map((r) => r.instanceId),
475443
});
476-
477-
// SSM cleanup should handle all 150 parameters
478-
expect(mockSSM.describeParameters).toBeCalledTimes(1);
479-
expect(mockSSM.deleteParameter).toBeCalledTimes(150);
480444
});
481445

482446
it('cleans up SSM parameters for successful batches even when later batch fails', async () => {
@@ -487,12 +451,6 @@ describe('terminateRunners', () => {
487451
environment: 'gi-ci',
488452
}));
489453

490-
mockSSMdescribeParametersRet.mockResolvedValueOnce({
491-
Parameters: runners.slice(0, 100).map((runner) => ({
492-
Name: getParameterNameForRunner(runner.environment as string, runner.instanceId),
493-
})),
494-
});
495-
496454
// First batch succeeds, second batch fails
497455
mockEC2.terminateInstances
498456
.mockReturnValueOnce({
@@ -505,9 +463,6 @@ describe('terminateRunners', () => {
505463
await expect(terminateRunners(runners, metrics)).rejects.toThrow('Failed to terminate some runners');
506464

507465
expect(mockEC2.terminateInstances).toBeCalledTimes(2);
508-
// SSM cleanup should still happen for the first 100 runners that were successfully terminated
509-
expect(mockSSM.describeParameters).toBeCalledTimes(1);
510-
expect(mockSSM.deleteParameter).toBeCalledTimes(100);
511466
});
512467

513468
it('handles SSM parameter cleanup failure gracefully', async () => {
@@ -519,18 +474,9 @@ describe('terminateRunners', () => {
519474
},
520475
];
521476

522-
// SSM describe fails, so it should attempt direct deletion
523-
mockSSMdescribeParametersRet.mockRejectedValueOnce(new Error('SSM describe failed'));
524-
525477
await terminateRunners(runners, metrics);
526478

527479
expect(mockEC2.terminateInstances).toBeCalledTimes(1);
528-
expect(mockSSM.describeParameters).toBeCalledTimes(1);
529-
// Should still attempt direct deletion even when describe fails
530-
expect(mockSSM.deleteParameter).toBeCalledTimes(1);
531-
expect(mockSSM.deleteParameter).toBeCalledWith({
532-
Name: getParameterNameForRunner(runners[0].environment as string, runners[0].instanceId),
533-
});
534480
});
535481
});
536482

@@ -1779,9 +1725,6 @@ describe('terminateRunner', () => {
17791725
};
17801726

17811727
// Mock terminateRunners by mocking the underlying calls
1782-
mockSSMdescribeParametersRet.mockResolvedValueOnce({
1783-
Parameters: [{ Name: 'gi-ci-i-1234' }],
1784-
});
17851728
mockEC2.terminateInstances.mockReturnValueOnce({
17861729
promise: jest.fn().mockResolvedValueOnce({}),
17871730
});
@@ -1792,8 +1735,5 @@ describe('terminateRunner', () => {
17921735
expect(mockEC2.terminateInstances).toBeCalledWith({
17931736
InstanceIds: ['i-1234'],
17941737
});
1795-
expect(mockSSM.deleteParameter).toBeCalledWith({
1796-
Name: 'gi-ci-i-1234',
1797-
});
17981738
});
17991739
});

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts

Lines changed: 31 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -371,53 +371,42 @@ async function terminateRunnersInRegion(runners: RunnerInfo[], metrics: Metrics,
371371
// We'll attempt to terminate all batches, but if any batch throws we still want to clean up the SSM
372372
// parameters for the instances that were already terminated. To achieve this we wrap the whole
373373
// operation in a try / finally block so that the cleanup always executes.
374-
try {
375-
for (const [batchIndex, instanceBatch] of instanceBatches.entries()) {
376-
console.info(
377-
`[${region}] Processing batch ${batchIndex + 1}/${instanceBatches.length} with ${
378-
instanceBatch.length
379-
} instances: ${instanceBatch.map((r) => r.instanceId).join(', ')}`,
380-
);
381-
382-
try {
383-
await expBackOff(() => {
384-
return metrics.trackRequestRegion(
385-
region,
386-
metrics.ec2TerminateInstancesAWSCallSuccess,
387-
metrics.ec2TerminateInstancesAWSCallFailure,
388-
() => {
389-
return ec2.terminateInstances({ InstanceIds: instanceBatch.map((r) => r.instanceId) }).promise();
390-
},
391-
);
392-
});
374+
for (const [batchIndex, instanceBatch] of instanceBatches.entries()) {
375+
console.info(
376+
`[${region}] Processing batch ${batchIndex + 1}/${instanceBatches.length} with ${
377+
instanceBatch.length
378+
} instances: ${instanceBatch.map((r) => r.instanceId).join(', ')}`,
379+
);
393380

394-
console.info(
395-
`[${region}] Successfully terminated batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
396-
.map((r) => r.instanceId)
397-
.join(', ')}`,
381+
try {
382+
await expBackOff(() => {
383+
return metrics.trackRequestRegion(
384+
region,
385+
metrics.ec2TerminateInstancesAWSCallSuccess,
386+
metrics.ec2TerminateInstancesAWSCallFailure,
387+
() => {
388+
return ec2.terminateInstances({ InstanceIds: instanceBatch.map((r) => r.instanceId) }).promise();
389+
},
398390
);
391+
});
399392

400-
// Record successfully terminated runners so that we can clean up their SSM parameters later.
401-
successfullyTerminated.push(...instanceBatch);
402-
} catch (e) {
403-
console.error(
404-
`[${region}] Failed to terminate batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
405-
.map((r) => r.instanceId)
406-
.join(', ')} - ${e}`,
407-
);
408-
// Re-throw so that callers are aware of the failure; the finally block will still execute and
409-
// attempt SSM cleanup for the instances that were already terminated.
410-
throw e;
411-
}
412-
}
413-
} finally {
414-
try {
415-
await cleanupSSMParametersForRunners(successfullyTerminated, metrics, region, ssm);
416-
} catch (cleanupErr) {
417-
// We do not want cleanup issues to mask the original termination error, just log them.
393+
console.info(
394+
`[${region}] Successfully terminated batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
395+
.map((r) => r.instanceId)
396+
.join(', ')}`,
397+
);
398+
399+
// Record successfully terminated runners so that we can clean up their SSM parameters later.
400+
successfullyTerminated.push(...instanceBatch);
401+
} catch (e) {
418402
console.error(
419-
`[${region}] Error during SSM parameter cleanup for ${successfullyTerminated.length} runners: ${cleanupErr}`,
403+
`[${region}] Failed to terminate batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
404+
.map((r) => r.instanceId)
405+
.join(', ')} - ${e}`,
420406
);
407+
// Re-throw so that callers are aware of the failure; the finally block will still execute and
408+
// attempt SSM cleanup for the instances that were already terminated.
409+
throw e;
421410
}
422411
}
423412
}

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,30 +1448,24 @@ describe('scale-down', () => {
14481448

14491449
it('dont finds on listGithubRunnersRep, finds with getRunnerRepo, busy === false', async () => {
14501450
const mockedListGithubRunnersRepo = mocked(listGithubRunnersRepo);
1451-
const mockedGetRunnerRepo = mocked(getRunnerRepo);
14521451
const ec2runner: RunnerInfo = {
14531452
awsRegion: baseConfig.awsRegion,
14541453
repo: repoKey,
14551454
instanceId: 'instance-id-03',
14561455
runnerType: 'runnerType-01',
14571456
ghRunnerId: 'ghRunnerId-01',
14581457
};
1459-
const theGhRunner = { name: 'instance-id-03', busy: false } as GhRunner;
14601458

14611459
mockedListGithubRunnersRepo.mockResolvedValueOnce(ghRunners);
1462-
mockedGetRunnerRepo.mockResolvedValueOnce(theGhRunner);
14631460

1464-
expect(await getGHRunnerRepo(ec2runner, metrics)).toEqual(theGhRunner);
1461+
expect(await getGHRunnerRepo(ec2runner, metrics)).toBeUndefined();
14651462

14661463
expect(mockedListGithubRunnersRepo).toBeCalledTimes(1);
14671464
expect(mockedListGithubRunnersRepo).toBeCalledWith(repo, metrics);
1468-
expect(mockedGetRunnerRepo).toBeCalledTimes(1);
1469-
expect(mockedGetRunnerRepo).toBeCalledWith(repo, ec2runner.ghRunnerId, metrics);
14701465
});
14711466

14721467
it('listGithubRunnersRep and getRunnerRepo throws exception', async () => {
14731468
const mockedListGithubRunnersRepo = mocked(listGithubRunnersRepo);
1474-
const mockedGetRunnerRepo = mocked(getRunnerRepo);
14751469
const ec2runner: RunnerInfo = {
14761470
awsRegion: baseConfig.awsRegion,
14771471
repo: repoKey,
@@ -1481,14 +1475,11 @@ describe('scale-down', () => {
14811475
};
14821476

14831477
mockedListGithubRunnersRepo.mockRejectedValueOnce('Error');
1484-
mockedGetRunnerRepo.mockRejectedValueOnce('Error');
14851478

14861479
expect(await getGHRunnerRepo(ec2runner, metrics)).toBeUndefined();
14871480

14881481
expect(mockedListGithubRunnersRepo).toBeCalledTimes(1);
14891482
expect(mockedListGithubRunnersRepo).toBeCalledWith(repo, metrics);
1490-
expect(mockedGetRunnerRepo).toBeCalledTimes(1);
1491-
expect(mockedGetRunnerRepo).toBeCalledWith(repo, ec2runner.ghRunnerId, metrics);
14921483
});
14931484
});
14941485

@@ -1635,30 +1626,24 @@ describe('scale-down', () => {
16351626

16361627
it('dont finds on listGithubRunnersOrg, finds with getRunnerOrg, busy === false', async () => {
16371628
const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg);
1638-
const mockedGetRunnerOrg = mocked(getRunnerOrg);
16391629
const ec2runner: RunnerInfo = {
16401630
awsRegion: baseConfig.awsRegion,
16411631
org: org,
16421632
instanceId: 'instance-id-03',
16431633
runnerType: 'runnerType-01',
16441634
ghRunnerId: 'ghRunnerId-01',
16451635
};
1646-
const theGhRunner = { name: 'instance-id-03', busy: false } as GhRunner;
16471636

16481637
mockedListGithubRunnersOrg.mockResolvedValueOnce(ghRunners);
1649-
mockedGetRunnerOrg.mockResolvedValueOnce(theGhRunner);
16501638

1651-
expect(await getGHRunnerOrg(ec2runner, metrics)).toEqual(theGhRunner);
1639+
expect(await getGHRunnerOrg(ec2runner, metrics)).toBeUndefined();
16521640

16531641
expect(mockedListGithubRunnersOrg).toBeCalledTimes(1);
16541642
expect(mockedListGithubRunnersOrg).toBeCalledWith(org, metrics);
1655-
expect(mockedGetRunnerOrg).toBeCalledTimes(1);
1656-
expect(mockedGetRunnerOrg).toBeCalledWith(org, ec2runner.ghRunnerId, metrics);
16571643
});
16581644

16591645
it('listGithubRunnersRep and getRunnerRepo throws exception', async () => {
16601646
const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg);
1661-
const mockedGetRunnerOrg = mocked(getRunnerOrg);
16621647
const ec2runner: RunnerInfo = {
16631648
awsRegion: baseConfig.awsRegion,
16641649
org: org,
@@ -1668,14 +1653,11 @@ describe('scale-down', () => {
16681653
};
16691654

16701655
mockedListGithubRunnersOrg.mockRejectedValueOnce('Error');
1671-
mockedGetRunnerOrg.mockRejectedValueOnce('Error');
16721656

16731657
expect(await getGHRunnerOrg(ec2runner, metrics)).toBeUndefined();
16741658

16751659
expect(mockedListGithubRunnersOrg).toBeCalledTimes(1);
16761660
expect(mockedListGithubRunnersOrg).toBeCalledWith(org, metrics);
1677-
expect(mockedGetRunnerOrg).toBeCalledTimes(1);
1678-
expect(mockedGetRunnerOrg).toBeCalledWith(org, ec2runner.ghRunnerId, metrics);
16791661
});
16801662

16811663
it('getRunner throws when api rate limit is hit', async () => {

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -298,24 +298,6 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe
298298
}
299299
}
300300

301-
if (ghRunner === undefined && ec2runner.ghRunnerId !== undefined) {
302-
console.warn(
303-
`Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) not found in ` +
304-
`listGithubRunnersOrg call, attempting to grab directly`,
305-
);
306-
try {
307-
ghRunner = await getRunnerOrg(ec2runner.org as string, ec2runner.ghRunnerId, metrics);
308-
} catch (e) {
309-
console.warn(
310-
`Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) error when ` +
311-
`listGithubRunnersOrg call: ${e}`,
312-
);
313-
/* istanbul ignore next */
314-
if (isGHRateLimitError(e)) {
315-
throw e;
316-
}
317-
}
318-
}
319301
if (ghRunner) {
320302
if (ghRunner.busy) {
321303
metrics.runnerGhFoundBusyOrg(org, ec2runner);
@@ -343,31 +325,6 @@ export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownM
343325
}
344326
}
345327

346-
if (ghRunner === undefined) {
347-
if (ec2runner.ghRunnerId === undefined) {
348-
console.warn(
349-
`Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) was neither found in ` +
350-
`the list of runners returned by the listGithubRunnersRepo api call, nor did it have the ` +
351-
`GithubRunnerId EC2 tag set. This can happen if there's no runner running on the instance.`,
352-
);
353-
} else {
354-
console.warn(
355-
`Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) not found in ` +
356-
`listGithubRunnersRepo call, attempting to grab directly`,
357-
);
358-
try {
359-
ghRunner = await getRunnerRepo(repo, ec2runner.ghRunnerId, metrics);
360-
} catch (e) {
361-
console.warn(
362-
`Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) error when getRunnerRepo call: ${e}`,
363-
);
364-
/* istanbul ignore next */
365-
if (isGHRateLimitError(e)) {
366-
throw e;
367-
}
368-
}
369-
}
370-
}
371328
if (ghRunner !== undefined) {
372329
if (ghRunner.busy) {
373330
metrics.runnerGhFoundBusyRepo(repo, ec2runner);

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ export async function expBackOff<T>(
7878
if (expBackOffMs > maxMs) {
7979
throw e;
8080
}
81+
const functionName = callback.name || 'anonymous';
82+
console.warn(`[expBackOff]"${functionName}" needing to back off for function for ${expBackOffMs}ms: ${e}`);
8183
await new Promise((resolve) => setTimeout(resolve, expBackOffMs));
8284
expBackOffMs = expBackOffMs * backOffFactor;
8385
} else {

0 commit comments

Comments
 (0)