Skip to content

Commit 003bee0

Browse files
authored
runners: Add batching for terminateRunner (#6852)
I had noticed that we were terminating instances 1 by 1 in the original code so this adds batching for terminateRunner calls in order to fix those performance bottlenecks. As well during the termination we were deleting ssm parameters one by one so this also adds batching to the ssm parameter deletion as well. Goal here was to implement the performance improvements with minimal changes. This PR super-cedes #6725 --------- Signed-off-by: Eli Uriegas <[email protected]> Signed-off-by: Eli Uriegas <[email protected]>
1 parent 447456b commit 003bee0

File tree

4 files changed

+446
-142
lines changed

4 files changed

+446
-142
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts

Lines changed: 216 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
resetRunnersCaches,
99
terminateRunner,
1010
tryReuseRunner,
11+
terminateRunners,
1112
} from './runners';
1213
import { RunnerInfo } from './utils';
1314
import { ScaleUpMetrics } from './metrics';
@@ -326,10 +327,11 @@ describe('listSSMParameters', () => {
326327
});
327328
});
328329

329-
describe('terminateRunner', () => {
330+
describe('terminateRunners', () => {
330331
beforeEach(() => {
331332
mockSSMdescribeParametersRet.mockClear();
332333
mockEC2.terminateInstances.mockClear();
334+
mockSSM.deleteParameter.mockClear();
333335
const config = {
334336
environment: 'gi-ci',
335337
minimumRunningTimeInMinutes: 45,
@@ -339,66 +341,195 @@ describe('terminateRunner', () => {
339341
resetRunnersCaches();
340342
});
341343

342-
it('calls terminateInstances', async () => {
343-
const runner: RunnerInfo = {
344-
awsRegion: Config.Instance.awsRegion,
345-
instanceId: 'i-1234',
346-
environment: 'gi-ci',
347-
};
344+
it('terminates multiple runners in same region successfully', async () => {
345+
const runners: RunnerInfo[] = [
346+
{
347+
awsRegion: 'us-east-1',
348+
instanceId: 'i-1234',
349+
environment: 'gi-ci',
350+
},
351+
{
352+
awsRegion: 'us-east-1',
353+
instanceId: 'i-5678',
354+
environment: 'gi-ci',
355+
},
356+
];
357+
348358
mockSSMdescribeParametersRet.mockResolvedValueOnce({
349-
Parameters: [getParameterNameForRunner(runner.environment as string, runner.instanceId)].map((s) => {
350-
return { Name: s };
351-
}),
359+
Parameters: runners
360+
.map((runner) => getParameterNameForRunner(runner.environment as string, runner.instanceId))
361+
.map((s) => ({ Name: s })),
352362
});
353-
await terminateRunner(runner, metrics);
354363

364+
await terminateRunners(runners, metrics);
365+
366+
expect(mockEC2.terminateInstances).toBeCalledTimes(1);
355367
expect(mockEC2.terminateInstances).toBeCalledWith({
356-
InstanceIds: [runner.instanceId],
368+
InstanceIds: ['i-1234', 'i-5678'],
357369
});
358370
expect(mockSSM.describeParameters).toBeCalledTimes(1);
359-
expect(mockSSM.deleteParameter).toBeCalledTimes(1);
360-
expect(mockSSM.deleteParameter).toBeCalledWith({
361-
Name: getParameterNameForRunner(runner.environment as string, runner.instanceId),
371+
expect(mockSSM.deleteParameter).toBeCalledTimes(2);
372+
});
373+
374+
it('terminates runners across multiple regions', async () => {
375+
const runners: RunnerInfo[] = [
376+
{
377+
awsRegion: 'us-east-1',
378+
instanceId: 'i-1234',
379+
environment: 'gi-ci',
380+
},
381+
{
382+
awsRegion: 'us-west-2',
383+
instanceId: 'i-5678',
384+
environment: 'gi-ci',
385+
},
386+
];
387+
388+
mockSSMdescribeParametersRet.mockResolvedValue({
389+
Parameters: [{ Name: 'gi-ci-i-1234' }, { Name: 'gi-ci-i-5678' }],
390+
});
391+
392+
await terminateRunners(runners, metrics);
393+
394+
expect(mockEC2.terminateInstances).toBeCalledTimes(2);
395+
expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(1, {
396+
InstanceIds: ['i-1234'],
362397
});
398+
expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(2, {
399+
InstanceIds: ['i-5678'],
400+
});
401+
expect(mockSSM.describeParameters).toBeCalledTimes(2);
402+
expect(mockSSM.deleteParameter).toBeCalledTimes(2);
363403
});
364404

365-
it('fails to terminate', async () => {
366-
const errMsg = 'Error message';
367-
const runner: RunnerInfo = {
368-
awsRegion: Config.Instance.awsRegion,
369-
instanceId: '1234',
370-
};
371-
mockEC2.terminateInstances.mockClear().mockReturnValue({
372-
promise: jest.fn().mockRejectedValueOnce(Error(errMsg)),
405+
it('handles partial failure - terminates some runners but fails on others', async () => {
406+
const runners: RunnerInfo[] = [
407+
{
408+
awsRegion: 'us-east-1',
409+
instanceId: 'i-1234',
410+
environment: 'gi-ci',
411+
},
412+
{
413+
awsRegion: 'us-east-1',
414+
instanceId: 'i-5678',
415+
environment: 'gi-ci',
416+
},
417+
{
418+
awsRegion: 'us-west-2',
419+
instanceId: 'i-9999',
420+
environment: 'gi-ci',
421+
},
422+
];
423+
424+
// First region succeeds
425+
mockSSMdescribeParametersRet.mockResolvedValueOnce({
426+
Parameters: [{ Name: 'gi-ci-i-1234' }, { Name: 'gi-ci-i-5678' }],
373427
});
374-
expect(terminateRunner(runner, metrics)).rejects.toThrowError(errMsg);
375-
expect(mockSSM.describeParameters).not.toBeCalled();
376-
expect(mockSSM.deleteParameter).not.toBeCalled();
428+
429+
// Second region also gets SSM parameters but has no successful terminations to clean up
430+
mockSSMdescribeParametersRet.mockResolvedValueOnce({
431+
Parameters: [],
432+
});
433+
434+
// First region succeeds, second region fails
435+
mockEC2.terminateInstances
436+
.mockReturnValueOnce({
437+
promise: jest.fn().mockResolvedValueOnce({}),
438+
})
439+
.mockReturnValueOnce({
440+
promise: jest.fn().mockRejectedValueOnce(new Error('Region failure')),
441+
});
442+
443+
await expect(terminateRunners(runners, metrics)).rejects.toThrow(
444+
'Failed to terminate some runners: Instance i-9999: Region failure',
445+
);
446+
447+
expect(mockEC2.terminateInstances).toBeCalledTimes(2);
448+
expect(mockSSM.describeParameters).toBeCalledTimes(2); // Called for both regions
449+
expect(mockSSM.deleteParameter).toBeCalledTimes(2); // Only for successful region
377450
});
378451

379-
it('fails to list parameters on terminate, then force delete all next parameters', async () => {
380-
const runner1: RunnerInfo = {
381-
awsRegion: Config.Instance.awsRegion,
382-
instanceId: '1234',
383-
environment: 'environ',
384-
};
385-
const runner2: RunnerInfo = {
386-
awsRegion: Config.Instance.awsRegion,
387-
instanceId: '1235',
388-
environment: 'environ',
389-
};
390-
mockSSMdescribeParametersRet.mockRejectedValueOnce('Some Error');
391-
await terminateRunner(runner1, metrics);
392-
await terminateRunner(runner2, metrics);
452+
it('handles large batches by splitting into chunks', async () => {
453+
// Create 150 runners to test batching (should split into 2 batches of 100 and 50)
454+
const runners: RunnerInfo[] = Array.from({ length: 150 }, (_, i) => ({
455+
awsRegion: 'us-east-1',
456+
instanceId: `i-${i.toString().padStart(4, '0')}`,
457+
environment: 'gi-ci',
458+
}));
393459

460+
mockSSMdescribeParametersRet.mockResolvedValueOnce({
461+
Parameters: runners.map((runner) => ({
462+
Name: getParameterNameForRunner(runner.environment as string, runner.instanceId),
463+
})),
464+
});
465+
466+
await terminateRunners(runners, metrics);
467+
468+
// Should make 2 terminate calls (batches of 100 and 50)
394469
expect(mockEC2.terminateInstances).toBeCalledTimes(2);
470+
expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(1, {
471+
InstanceIds: runners.slice(0, 100).map((r) => r.instanceId),
472+
});
473+
expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(2, {
474+
InstanceIds: runners.slice(100, 150).map((r) => r.instanceId),
475+
});
476+
477+
// SSM cleanup should handle all 150 parameters
395478
expect(mockSSM.describeParameters).toBeCalledTimes(1);
396-
expect(mockSSM.deleteParameter).toBeCalledTimes(2);
397-
expect(mockSSM.deleteParameter).toBeCalledWith({
398-
Name: getParameterNameForRunner(runner1.environment as string, runner1.instanceId),
479+
expect(mockSSM.deleteParameter).toBeCalledTimes(150);
480+
});
481+
482+
it('cleans up SSM parameters for successful batches even when later batch fails', async () => {
483+
// Create runners that will be split into 2 batches
484+
const runners: RunnerInfo[] = Array.from({ length: 150 }, (_, i) => ({
485+
awsRegion: 'us-east-1',
486+
instanceId: `i-${i.toString().padStart(4, '0')}`,
487+
environment: 'gi-ci',
488+
}));
489+
490+
mockSSMdescribeParametersRet.mockResolvedValueOnce({
491+
Parameters: runners.slice(0, 100).map((runner) => ({
492+
Name: getParameterNameForRunner(runner.environment as string, runner.instanceId),
493+
})),
399494
});
495+
496+
// First batch succeeds, second batch fails
497+
mockEC2.terminateInstances
498+
.mockReturnValueOnce({
499+
promise: jest.fn().mockResolvedValueOnce({}),
500+
})
501+
.mockReturnValueOnce({
502+
promise: jest.fn().mockRejectedValueOnce(new Error('Batch 2 failed')),
503+
});
504+
505+
await expect(terminateRunners(runners, metrics)).rejects.toThrow('Failed to terminate some runners');
506+
507+
expect(mockEC2.terminateInstances).toBeCalledTimes(2);
508+
// SSM cleanup should still happen for the first 100 runners that were successfully terminated
509+
expect(mockSSM.describeParameters).toBeCalledTimes(1);
510+
expect(mockSSM.deleteParameter).toBeCalledTimes(100);
511+
});
512+
513+
it('handles SSM parameter cleanup failure gracefully', async () => {
514+
const runners: RunnerInfo[] = [
515+
{
516+
awsRegion: 'us-east-1',
517+
instanceId: 'i-1234',
518+
environment: 'gi-ci',
519+
},
520+
];
521+
522+
// SSM describe fails, so it should attempt direct deletion
523+
mockSSMdescribeParametersRet.mockRejectedValueOnce(new Error('SSM describe failed'));
524+
525+
await terminateRunners(runners, metrics);
526+
527+
expect(mockEC2.terminateInstances).toBeCalledTimes(1);
528+
expect(mockSSM.describeParameters).toBeCalledTimes(1);
529+
// Should still attempt direct deletion even when describe fails
530+
expect(mockSSM.deleteParameter).toBeCalledTimes(1);
400531
expect(mockSSM.deleteParameter).toBeCalledWith({
401-
Name: getParameterNameForRunner(runner2.environment as string, runner2.instanceId),
532+
Name: getParameterNameForRunner(runners[0].environment as string, runners[0].instanceId),
402533
});
403534
});
404535
});
@@ -1625,3 +1756,44 @@ describe('createRunner', () => {
16251756
});
16261757
});
16271758
});
1759+
1760+
describe('terminateRunner', () => {
1761+
beforeEach(() => {
1762+
mockSSMdescribeParametersRet.mockClear();
1763+
mockEC2.terminateInstances.mockClear();
1764+
mockSSM.deleteParameter.mockClear();
1765+
const config = {
1766+
environment: 'gi-ci',
1767+
minimumRunningTimeInMinutes: 45,
1768+
};
1769+
jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => config as unknown as Config);
1770+
1771+
resetRunnersCaches();
1772+
});
1773+
1774+
it('delegates to terminateRunners with single runner array', async () => {
1775+
const runner: RunnerInfo = {
1776+
awsRegion: 'us-east-1',
1777+
instanceId: 'i-1234',
1778+
environment: 'gi-ci',
1779+
};
1780+
1781+
// Mock terminateRunners by mocking the underlying calls
1782+
mockSSMdescribeParametersRet.mockResolvedValueOnce({
1783+
Parameters: [{ Name: 'gi-ci-i-1234' }],
1784+
});
1785+
mockEC2.terminateInstances.mockReturnValueOnce({
1786+
promise: jest.fn().mockResolvedValueOnce({}),
1787+
});
1788+
1789+
await terminateRunner(runner, metrics);
1790+
1791+
// Verify the calls match what terminateRunners would do with a single runner
1792+
expect(mockEC2.terminateInstances).toBeCalledWith({
1793+
InstanceIds: ['i-1234'],
1794+
});
1795+
expect(mockSSM.deleteParameter).toBeCalledWith({
1796+
Name: 'gi-ci-i-1234',
1797+
});
1798+
});
1799+
});

0 commit comments

Comments
 (0)