Skip to content

Commit 95aa6a2

Browse files
stuartp44Copilot
andauthored
fix: add handling for 404 errors in scale-down tests and improve error logging (#4726)
This PR helps with dealing with 404 issues when terminating runners and checking if the runner is actually there or not. At the moment, we don't recognize the fact 404 is actually a good response for runners that are no longer there. This means the rest of the logic is ignored to do last chance checking. This PR helps by returning null for a 404, hence able to act on this or the runner data if we get it. For all other logic, we still return the previous error. --------- Co-authored-by: Copilot <[email protected]>
1 parent 61e72dc commit 95aa6a2

File tree

2 files changed

+157
-20
lines changed

2 files changed

+157
-20
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { Octokit } from '@octokit/rest';
2+
import { RequestError } from '@octokit/request-error';
23
import moment from 'moment';
34
import nock from 'nock';
45

@@ -403,6 +404,121 @@ describe('Scale down runners', () => {
403404
expect(mockTerminateRunners).toHaveBeenCalledWith(orphanRunner.instanceId);
404405
});
405406

407+
it('Should handle 404 error when checking orphaned runner (JIT) - treat as orphaned', async () => {
408+
// arrange
409+
const orphanRunner = createRunnerTestData(
410+
'orphan-jit-404',
411+
type,
412+
MINIMUM_BOOT_TIME + 1,
413+
false,
414+
true,
415+
true, // should be terminated when 404
416+
undefined,
417+
1234567890,
418+
);
419+
const runners = [orphanRunner];
420+
421+
mockGitHubRunners([]);
422+
mockAwsRunners(runners);
423+
424+
// Mock 404 error response
425+
const error404 = new RequestError('Runner not found', 404, {
426+
request: {
427+
method: 'GET',
428+
url: 'https://api.github.com/test',
429+
headers: {},
430+
},
431+
});
432+
433+
if (type === 'Repo') {
434+
mockOctokit.actions.getSelfHostedRunnerForRepo.mockRejectedValueOnce(error404);
435+
} else {
436+
mockOctokit.actions.getSelfHostedRunnerForOrg.mockRejectedValueOnce(error404);
437+
}
438+
439+
// act
440+
await scaleDown();
441+
442+
// assert - should terminate since 404 means runner doesn't exist on GitHub
443+
expect(mockTerminateRunners).toHaveBeenCalledWith(orphanRunner.instanceId);
444+
});
445+
446+
it('Should handle 404 error when checking runner busy state - treat as not busy', async () => {
447+
// arrange
448+
const runner = createRunnerTestData(
449+
'runner-404',
450+
type,
451+
MINIMUM_TIME_RUNNING_IN_MINUTES + 1,
452+
true,
453+
false,
454+
true, // should be terminated since not busy due to 404
455+
);
456+
const runners = [runner];
457+
458+
mockGitHubRunners(runners);
459+
mockAwsRunners(runners);
460+
461+
// Mock 404 error response for busy state check
462+
const error404 = new RequestError('Runner not found', 404, {
463+
request: {
464+
method: 'GET',
465+
url: 'https://api.github.com/test',
466+
headers: {},
467+
},
468+
});
469+
470+
if (type === 'Repo') {
471+
mockOctokit.actions.getSelfHostedRunnerForRepo.mockRejectedValueOnce(error404);
472+
} else {
473+
mockOctokit.actions.getSelfHostedRunnerForOrg.mockRejectedValueOnce(error404);
474+
}
475+
476+
// act
477+
await scaleDown();
478+
479+
// assert - should terminate since 404 means runner is not busy
480+
checkTerminated(runners);
481+
});
482+
483+
it('Should re-throw non-404 errors when checking runner state', async () => {
484+
// arrange
485+
const orphanRunner = createRunnerTestData(
486+
'orphan-error',
487+
type,
488+
MINIMUM_BOOT_TIME + 1,
489+
false,
490+
true,
491+
false,
492+
undefined,
493+
1234567890,
494+
);
495+
const runners = [orphanRunner];
496+
497+
mockGitHubRunners([]);
498+
mockAwsRunners(runners);
499+
500+
// Mock non-404 error response
501+
const error500 = new RequestError('Internal server error', 500, {
502+
request: {
503+
method: 'GET',
504+
url: 'https://api.github.com/test',
505+
headers: {},
506+
},
507+
});
508+
509+
if (type === 'Repo') {
510+
mockOctokit.actions.getSelfHostedRunnerForRepo.mockRejectedValueOnce(error500);
511+
} else {
512+
mockOctokit.actions.getSelfHostedRunnerForOrg.mockRejectedValueOnce(error500);
513+
}
514+
515+
// act & assert - should not throw because error handling is in terminateOrphan
516+
await expect(scaleDown()).resolves.not.toThrow();
517+
518+
// Should not terminate since the error was not a 404
519+
expect(terminateRunner).not.toHaveBeenCalledWith(orphanRunner.instanceId);
520+
});
521+
406522
it(`Should ignore errors when termination orphan fails.`, async () => {
407523
// setup
408524
const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true);

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { Octokit } from '@octokit/rest';
22
import { Endpoints } from '@octokit/types';
3+
import { RequestError } from '@octokit/request-error';
34
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
45
import moment from 'moment';
56

@@ -55,25 +56,39 @@ async function getGitHubSelfHostedRunnerState(
5556
client: Octokit,
5657
ec2runner: RunnerInfo,
5758
runnerId: number,
58-
): Promise<RunnerState> {
59-
const state =
60-
ec2runner.type === 'Org'
61-
? await client.actions.getSelfHostedRunnerForOrg({
62-
runner_id: runnerId,
63-
org: ec2runner.owner,
64-
})
65-
: await client.actions.getSelfHostedRunnerForRepo({
66-
runner_id: runnerId,
67-
owner: ec2runner.owner.split('/')[0],
68-
repo: ec2runner.owner.split('/')[1],
69-
});
70-
metricGitHubAppRateLimit(state.headers);
71-
72-
return state.data;
59+
): Promise<RunnerState | null> {
60+
try {
61+
const state =
62+
ec2runner.type === 'Org'
63+
? await client.actions.getSelfHostedRunnerForOrg({
64+
runner_id: runnerId,
65+
org: ec2runner.owner,
66+
})
67+
: await client.actions.getSelfHostedRunnerForRepo({
68+
runner_id: runnerId,
69+
owner: ec2runner.owner.split('/')[0],
70+
repo: ec2runner.owner.split('/')[1],
71+
});
72+
metricGitHubAppRateLimit(state.headers);
73+
74+
return state.data;
75+
} catch (error) {
76+
if (error instanceof RequestError && error.status === 404) {
77+
logger.info(`Runner '${ec2runner.instanceId}' with GitHub Runner ID '${runnerId}' not found on GitHub (404)`);
78+
return null;
79+
}
80+
throw error;
81+
}
7382
}
7483

7584
async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise<boolean> {
7685
const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId);
86+
if (state === null) {
87+
logger.info(
88+
`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Not found on GitHub, treating as not busy`,
89+
);
90+
return false;
91+
}
7792
logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.busy}`);
7893
return state.busy;
7994
}
@@ -227,12 +242,18 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise<boolean>
227242
const ec2Instance = runner as RunnerInfo;
228243
const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId);
229244
let isOrphan = false;
230-
logger.debug(
231-
`Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`,
232-
);
233-
const isOfflineAndBusy = state.status === 'offline' && state.busy;
234-
if (isOfflineAndBusy) {
245+
246+
if (state === null) {
247+
logger.debug(`Runner '${runner.instanceId}' not found on GitHub, treating as orphaned.`);
235248
isOrphan = true;
249+
} else {
250+
logger.debug(
251+
`Runner '${runner.instanceId}' is '${state.status}' and is currently '${state.busy ? 'busy' : 'idle'}'.`,
252+
);
253+
const isOfflineAndBusy = state.status === 'offline' && state.busy;
254+
if (isOfflineAndBusy) {
255+
isOrphan = true;
256+
}
236257
}
237258
logger.info(`Runner '${runner.instanceId}' is judged to ${isOrphan ? 'be' : 'not be'} orphaned.`);
238259
return isOrphan;

0 commit comments

Comments
 (0)