
Commit ac70214

feat(lambda): publish retry message from scale up if runner creation fails
1 parent 63ed8b6 commit ac70214

2 files changed: +221 additions, -38 deletions

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 181 additions & 0 deletions
@@ -13,6 +13,7 @@ import ScaleError from './ScaleError';
 import * as scaleUpModule from './scale-up';
 import { getParameter } from '@aws-github-runner/aws-ssm-util';
 import { describe, it, expect, beforeEach, vi } from 'vitest';
+import { publishRetryMessage } from './job-retry';

 const mockOctokit = {
   paginate: vi.fn(),
@@ -33,6 +34,7 @@ const mockCreateRunner = vi.mocked(createRunner);
 const mockListRunners = vi.mocked(listEC2Runners);
 const mockSSMClient = mockClient(SSMClient);
 const mockSSMgetParameter = vi.mocked(getParameter);
+const mockPublishRetryMessage = vi.mocked(publishRetryMessage);

 vi.mock('@octokit/rest', () => ({
   Octokit: vi.fn().mockImplementation(() => mockOctokit),
@@ -60,6 +62,10 @@ vi.mock('@aws-github-runner/aws-ssm-util', async () => {
   };
 });

+vi.mock('./job-retry', async () => ({
+  publishRetryMessage: vi.fn(),
+}));
+
 export type RunnerType = 'ephemeral' | 'non-ephemeral';

 // for ephemeral and non-ephemeral runners
@@ -134,6 +140,9 @@ beforeEach(() => {
       owner: TEST_DATA.repositoryOwner,
     },
   ]);
+  mockPublishRetryMessage.mockImplementation(async () => {
+    return;
+  });

   mockedAppAuth.mockResolvedValue({
     type: 'app',
@@ -328,6 +337,36 @@ describe('scaleUp with GHES', () => {
         ],
       });
     });
+
+    it('tries to publish a retry message when runner creation succeeds', async () => {
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when runner creation fails', async () => {
+      const mockCreateRunners = vi.mocked(createRunner);
+      mockCreateRunners.mockRejectedValue(new Error('no retry'));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when maximum runners has been reached', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '1';
+      process.env.ENABLE_EPHEMERAL_RUNNERS = 'false';
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('does not publish a retry message when the job is not queued', async () => {
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
+      mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
+        data: { status: 'completed' },
+      }));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).not.toBeCalled();
+    });
+
     it.each(RUNNER_TYPES)(
       'calls create start runner config of 40' + ' instances (ssm rate limit condition) to test time delay ',
       async (type: RunnerType) => {
@@ -456,6 +495,34 @@ describe('scaleUp with GHES', () => {
       await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow('no retry');
       mockCreateRunners.mockReset();
     });
+
+    it('tries to publish a retry message when runner creation succeeds', async () => {
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when runner creation fails', async () => {
+      const mockCreateRunners = vi.mocked(createRunner);
+      mockCreateRunners.mockRejectedValue(new Error('no retry'));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when maximum runners has been reached', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '1';
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('does not publish a retry message when the job is not queued', async () => {
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
+      mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
+        data: { status: 'completed' },
+      }));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).not.toBeCalled();
+    });
   });
 });

@@ -530,6 +597,34 @@ describe('scaleUp with public GH', () => {
       await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
       expect(createRunner).toBeCalledWith(expectedRunnerParams);
     });
+
+    it('tries to publish a retry message when runner creation succeeds', async () => {
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when runner creation fails', async () => {
+      const mockCreateRunners = vi.mocked(createRunner);
+      mockCreateRunners.mockRejectedValue(new Error('no retry'));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when maximum runners has been reached', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '1';
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('does not publish a retry message when the job is not queued', async () => {
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
+      mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
+        data: { status: 'completed' },
+      }));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).not.toBeCalled();
+    });
   });

   describe('on repo level', () => {
@@ -687,6 +782,34 @@ describe('scaleUp with public GH', () => {
       process.env.ENABLE_EPHEMERAL_RUNNERS = 'true';
       await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toBeInstanceOf(ScaleError);
     });
+
+    it('tries to publish a retry message when runner creation succeeds', async () => {
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when runner creation fails', async () => {
+      const mockCreateRunners = vi.mocked(createRunner);
+      mockCreateRunners.mockRejectedValue(new Error('no retry'));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when maximum runners has been reached', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '1';
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('does not publish a retry message when the job is not queued', async () => {
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
+      mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
+        data: { status: 'completed' },
+      }));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).not.toBeCalled();
+    });
   });
 });

@@ -863,6 +986,36 @@ describe('scaleUp with Github Data Residency', () => {
         ],
       });
     });
+
+    it('tries to publish a retry message when runner creation succeeds', async () => {
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when runner creation fails', async () => {
+      const mockCreateRunners = vi.mocked(createRunner);
+      mockCreateRunners.mockRejectedValue(new Error('no retry'));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when maximum runners has been reached', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '1';
+      process.env.ENABLE_EPHEMERAL_RUNNERS = 'false';
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('does not publish a retry message when the job is not queued', async () => {
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
+      mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
+        data: { status: 'completed' },
+      }));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).not.toBeCalled();
+    });
+
     it.each(RUNNER_TYPES)(
       'calls create start runner config of 40' + ' instances (ssm rate limit condition) to test time delay ',
       async (type: RunnerType) => {
@@ -991,6 +1144,34 @@ describe('scaleUp with Github Data Residency', () => {
       await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow('no retry');
       mockCreateRunners.mockReset();
     });
+
+    it('tries to publish a retry message when runner creation succeeds', async () => {
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when runner creation fails', async () => {
+      const mockCreateRunners = vi.mocked(createRunner);
+      mockCreateRunners.mockRejectedValue(new Error('no retry'));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('tries to publish a retry message when maximum runners has been reached', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '1';
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
+    });
+
+    it('does not publish a retry message when the job is not queued', async () => {
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
+      mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
+        data: { status: 'completed' },
+      }));
+      await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
+      expect(publishRetryMessage).not.toBeCalled();
+    });
   });
 });
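The four retry-message cases above are added verbatim to every describe block (GHES, public GitHub, and GitHub Data Residency, at both org and repo level). A hedged sketch of how that repetition could be factored into a shared helper; expectRetryMessageBehaviour is a hypothetical name, not part of this commit, and it assumes the test file's scaleUpModule, TEST_DATA, createRunner, mockOctokit, and mocked publishRetryMessage are in scope:

// Hypothetical helper (not in this commit): registers the shared retry-message cases
// so each describe block can call expectRetryMessageBehaviour() instead of repeating them.
function expectRetryMessageBehaviour() {
  it('tries to publish a retry message when runner creation succeeds', async () => {
    await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
    expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
  });

  it('tries to publish a retry message when runner creation fails', async () => {
    vi.mocked(createRunner).mockRejectedValue(new Error('no retry'));
    await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toThrow();
    expect(publishRetryMessage).toBeCalledWith(TEST_DATA);
  });

  it('does not publish a retry message when the job is not queued', async () => {
    process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
    mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
      data: { status: 'completed' },
    }));
    await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).resolves.not.toThrow();
    expect(publishRetryMessage).not.toBeCalled();
  });
}

Each describe block would then contain a single expectRetryMessageBehaviour() call, with block-specific environment tweaks such as RUNNERS_MAXIMUM_COUNT kept in that block's own setup.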

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 40 additions & 38 deletions
@@ -305,46 +305,48 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
       scaleUp = currentRunners.length < maximumRunners;
     }

-    if (scaleUp) {
-      logger.info(`Attempting to launch a new runner`);
-
-      await createRunners(
-        {
-          ephemeral,
-          enableJitConfig,
-          ghesBaseUrl,
-          runnerLabels,
-          runnerGroup,
-          runnerNamePrefix,
-          runnerOwner,
-          runnerType,
-          disableAutoUpdate,
-          ssmTokenPath,
-          ssmConfigPath,
-        },
-        {
-          ec2instanceCriteria: {
-            instanceTypes,
-            targetCapacityType: instanceTargetCapacityType,
-            maxSpotPrice: instanceMaxSpotPrice,
-            instanceAllocationStrategy: instanceAllocationStrategy,
+    try {
+      if (scaleUp) {
+        logger.info(`Attempting to launch a new runner`);
+
+        await createRunners(
+          {
+            ephemeral,
+            enableJitConfig,
+            ghesBaseUrl,
+            runnerLabels,
+            runnerGroup,
+            runnerNamePrefix,
+            runnerOwner,
+            runnerType,
+            disableAutoUpdate,
+            ssmTokenPath,
+            ssmConfigPath,
           },
-          environment,
-          launchTemplateName,
-          subnets,
-          amiIdSsmParameterName,
-          tracingEnabled,
-          onDemandFailoverOnError,
-        },
-        githubInstallationClient,
-      );
-
-      await publishRetryMessage(payload);
-    } else {
-      logger.info('No runner will be created, maximum number of runners reached.');
-      if (ephemeral) {
-        throw new ScaleError('No runners create: maximum of runners reached.');
+          {
+            ec2instanceCriteria: {
+              instanceTypes,
+              targetCapacityType: instanceTargetCapacityType,
+              maxSpotPrice: instanceMaxSpotPrice,
+              instanceAllocationStrategy: instanceAllocationStrategy,
+            },
+            environment,
+            launchTemplateName,
+            subnets,
+            amiIdSsmParameterName,
+            tracingEnabled,
+            onDemandFailoverOnError,
+          },
+          githubInstallationClient,
+        );
+      } else {
+        logger.info('No runner will be created, maximum number of runners reached.');
+        if (ephemeral) {
+          throw new ScaleError('No runners create: maximum of runners reached.');
+        }
       }
+    } finally {
+      await publishRetryMessage(payload);
     }
   } else {
     logger.info('No runner will be created, job is not queued.');
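The net effect of the rework above is that publishRetryMessage(payload) moves out of the success path and into a finally block: a retry message is now published whether createRunners succeeds, fails, or is skipped because the runner maximum was reached, but not when the job is no longer queued, since that branch never enters the try. A minimal sketch of the resulting control flow; jobIsQueued, runnerConfig, and ec2Config are placeholders standing in for the real checks and the two option objects shown in the diff:

// Sketch only; placeholder names stand in for the real code above.
if (jobIsQueued) {
  try {
    if (scaleUp) {
      logger.info(`Attempting to launch a new runner`);
      await createRunners(runnerConfig, ec2Config, githubInstallationClient);
    } else {
      logger.info('No runner will be created, maximum number of runners reached.');
      if (ephemeral) {
        throw new ScaleError('No runners create: maximum of runners reached.');
      }
    }
  } finally {
    // Runs on success, on createRunners failure, and when the ScaleError above is thrown.
    await publishRetryMessage(payload);
  }
} else {
  logger.info('No runner will be created, job is not queued.');
}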
