diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index dfe4d99fcf..d92bad2cc4 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -2,13 +2,17 @@ import { CreateFleetCommand, CreateFleetResult, CreateTagsCommand, + DefaultTargetCapacityType, DescribeInstancesCommand, DescribeInstancesResult, EC2Client, FleetLaunchTemplateOverridesRequest, Tag, TerminateInstancesCommand, + StopInstancesCommand, + StartInstancesCommand, _InstanceType, + Filter, } from '@aws-sdk/client-ec2'; import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import { getTracedAWSV3Client, tracer } from '@aws-github-runner/aws-powertools-util'; @@ -167,15 +171,12 @@ async function processFleetResult( ); const errors = fleet.Errors?.flatMap((e) => e.ErrorCode || '') || []; - // Educated guess of errors that would make sense to retry based on the list - // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html const scaleErrors = [ 'UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException', 'RequestLimitExceeded', 'ResourceLimitExceeded', - 'MaxSpotInstanceCountExceeded', 'MaxSpotFleetRequestCountExceeded', 'InsufficientInstanceCapacity', ]; @@ -184,7 +185,7 @@ async function processFleetResult( errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) && runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot' ) { - logger.warn(`Create fleet failed, initatiing fall back to on demand instances.`); + logger.warn(`Create fleet failed, initiating fall back to on demand instances.`); logger.debug('Create fleet failed.', { data: fleet.Errors }); const numberOfInstances = runnerParameters.numberOfRunners - instances.length; const instancesOnDemand = await createRunner({ @@ -218,12 +219,10 @@ async function getAmiIdOverride(runnerParameters: Runners.RunnerInputParameters) return amiIdOverride; } catch (e) { logger.debug( - `Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}. ` + - 'Please ensure that the given parameter exists on this region and contains a valid runner AMI ID', + `Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}.`, { error: e }, ); - throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}, - ${e}`); + throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}, ${e}`); } } @@ -244,54 +243,80 @@ async function createInstances( tags.push({ Key: 'ghr:trace_id', Value: traceId! }); } - let fleet: CreateFleetResult; - try { - // see for spec https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html - const createFleetCommand = new CreateFleetCommand({ - LaunchTemplateConfigs: [ - { - LaunchTemplateSpecification: { - LaunchTemplateName: runnerParameters.launchTemplateName, - Version: '$Default', - }, - Overrides: generateFleetOverrides( - runnerParameters.subnets, - runnerParameters.ec2instanceCriteria.instanceTypes, - amiIdOverride, - ), + const createFleetCommand = new CreateFleetCommand({ + LaunchTemplateConfigs: [ + { + LaunchTemplateSpecification: { + LaunchTemplateName: runnerParameters.launchTemplateName, + Version: '$Default', }, - ], - SpotOptions: { - MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice, - AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy, + Overrides: generateFleetOverrides( + runnerParameters.subnets, + runnerParameters.ec2instanceCriteria.instanceTypes, + amiIdOverride, + ), }, - TargetCapacitySpecification: { - TotalTargetCapacity: runnerParameters.numberOfRunners, - DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType, + ], + SpotOptions: { + MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice, + AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy, + }, + TargetCapacitySpecification: { + TotalTargetCapacity: runnerParameters.numberOfRunners, + DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType, + }, + TagSpecifications: [ + { + ResourceType: 'instance', + Tags: tags, }, - TagSpecifications: [ - { - ResourceType: 'instance', - Tags: tags, - }, - { - ResourceType: 'volume', - Tags: tags, - }, - ], - Type: 'instant', - }); - fleet = await ec2Client.send(createFleetCommand); - } catch (e) { - logger.warn('Create fleet request failed.', { error: e as Error }); - throw e; - } - return fleet; + { + ResourceType: 'volume', + Tags: tags, + }, + ], + Type: 'instant', + }); + return await ec2Client.send(createFleetCommand); } -// If launchTime is undefined, this will return false export function bootTimeExceeded(ec2Runner: { launchTime?: Date }): boolean { const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES; const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes'); return launchTimePlusBootTime < moment(new Date()).utc(); } + +// New: Hibernate the runner instance +export async function hibernateRunner(instanceId: string): Promise { + logger.debug(`Runner '${instanceId}' will be hibernated (stopped with hibernation).`); + const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); + // Note: Stopping an instance that supports hibernation and has hibernation configured will hibernate it. + await ec2.send(new StopInstancesCommand({ InstanceIds: [instanceId], Hibernate: true })); + await tag(instanceId, [{ Key: 'ghr:hibernated', Value: 'true' }]); + logger.debug(`Runner ${instanceId} has been hibernated.`); +} + +// Helper function to resume hibernated instances +export async function resumeHibernatedInstances(count: number): Promise { + const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); + const filters: Filter[] = [ + { Name: 'instance-state-name', Values: ['stopped'] }, + { Name: 'tag:ghr:hibernated', Values: ['true'] }, + { Name: 'tag:ghr:Application', Values: ['github-action-runner'] }, + ]; + + const desc = await ec2.send(new DescribeInstancesCommand({ Filters: filters })); + const stoppedInstances = desc.Reservations?.flatMap(r => r.Instances?.map(i => i.InstanceId!) || []) || []; + + const toResume = stoppedInstances.slice(0, count); + if (toResume.length > 0) { + logger.info(`Resuming hibernated instances: ${toResume.join(',')}`); + await ec2.send(new StartInstancesCommand({ InstanceIds: toResume })); + // Optionally remove hibernated tag after start + for (const instanceId of toResume) { + await tag(instanceId, [{Key: 'ghr:hibernated', Value: 'false'}]); + } + } + + return toResume; +} \ No newline at end of file diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index be7183f929..da5a64cbb9 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -3,11 +3,12 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import moment from 'moment'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners'; -import { RunnerInfo, RunnerList } from './../aws/runners.d'; +import { bootTimeExceeded, hibernateRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners'; +import { RunnerInfo } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; +import yn from 'yn'; const logger = createChildLogger('scale-down'); @@ -103,39 +104,65 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { return launchTimePlusMinimum < now; } -async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { +async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[], ephemeral: boolean): Promise { const githubAppClient = await getOrCreateOctokit(ec2runner); try { const states = await Promise.all( ghRunnerIds.map(async (ghRunnerId) => { - // Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition. return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId); }), ); if (states.every((busy) => busy === false)) { - const statuses = await Promise.all( - ghRunnerIds.map(async (ghRunnerId) => { - return ( - ec2runner.type === 'Org' - ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ - runner_id: ghRunnerId, - org: ec2runner.owner, - }) - : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ - runner_id: ghRunnerId, - owner: ec2runner.owner.split('/')[0], - repo: ec2runner.owner.split('/')[1], - }) - ).status; - }), - ); - - if (statuses.every((status) => status == 204)) { - await terminateRunner(ec2runner.instanceId); - logger.debug(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); + // If ephemeral, still terminate. If not ephemeral, hibernate. + if (ephemeral) { + const statuses = await Promise.all( + ghRunnerIds.map(async (ghRunnerId) => { + return ( + ec2runner.type === 'Org' + ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ + runner_id: ghRunnerId, + org: ec2runner.owner, + }) + : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ + runner_id: ghRunnerId, + owner: ec2runner.owner.split('/')[0], + repo: ec2runner.owner.split('/')[1], + }) + ).status; + }), + ); + + if (statuses.every((status) => status == 204)) { + await terminateRunner(ec2runner.instanceId); + logger.debug(`AWS runner instance '${ec2runner.instanceId}' terminated (ephemeral runner).`); + } else { + logger.error(`Failed to de-register GitHub runner: ${statuses}`); + } } else { - logger.error(`Failed to de-register GitHub runner: ${statuses}`); + // Non-ephemeral runner: Hibernate + const statuses = await Promise.all( + ghRunnerIds.map(async (ghRunnerId) => { + return ( + ec2runner.type === 'Org' + ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ + runner_id: ghRunnerId, + org: ec2runner.owner, + }) + : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ + runner_id: ghRunnerId, + owner: ec2runner.owner.split('/')[0], + repo: ec2runner.owner.split('/')[1], + }) + ).status; + }), + ); + if (statuses.every((status) => status == 204)) { + await hibernateRunner(ec2runner.instanceId); + logger.debug(`AWS runner instance '${ec2runner.instanceId}' is hibernated (non-ephemeral runner).`); + } else { + logger.error(`Failed to de-register GitHub runner before hibernation: ${statuses}`); + } } } else { logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); @@ -155,6 +182,8 @@ async function evaluateAndRemoveRunners( const evictionStrategy = getEvictionStrategy(scaleDownConfigs); const ownerTags = new Set(ec2Runners.map((runner) => runner.owner)); + const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false }); + for (const ownerTag of ownerTags) { const ec2RunnersFiltered = ec2Runners .filter((runner) => runner.owner === ownerTag) @@ -172,16 +201,19 @@ async function evaluateAndRemoveRunners( logger.debug( `GitHub runners for AWS runner instance: '${ec2Runner.instanceId}': ${JSON.stringify(ghRunnersFiltered)}`, ); + const ephemeral = ephemeralEnabled && process.env.ENABLE_EPHEMERAL_RUNNERS === 'true'; + if (ghRunnersFiltered.length) { if (runnerMinimumTimeExceeded(ec2Runner)) { if (idleCounter > 0) { idleCounter--; logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`); } else { - logger.info(`Terminating all non busy runners.`); + logger.info(`Terminating or hibernating non busy runners.`); await removeRunner( ec2Runner, ghRunnersFiltered.map((runner: { id: number }) => runner.id), + ephemeral, ); } } @@ -230,26 +262,18 @@ export function newestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number { return oldestFirstStrategy(a, b) * -1; } -async function listRunners(environment: string) { - return await listEC2Runners({ - environment, - }); -} - -function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] { - return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[]; -} - export async function scaleDown(): Promise { githubCache.reset(); const environment = process.env.ENVIRONMENT; const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig]; - // first runners marked to be orphan. + // first terminate orphan runners await terminateOrphan(environment); // next scale down idle runners with respect to config and mark potential orphans - const ec2Runners = await listRunners(environment); + const ec2Runners = await listEC2Runners({ + environment, + }); const activeEc2RunnersCount = ec2Runners.length; logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`); logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(ec2Runners)}`); @@ -259,9 +283,9 @@ export async function scaleDown(): Promise { return; } - const runners = filterRunners(ec2Runners); + const runners = ec2Runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan); await evaluateAndRemoveRunners(runners, scaleDownConfigs); - const activeEc2RunnersCountAfter = (await listRunners(environment)).length; + const activeEc2RunnersCountAfter = (await listEC2Runners({ environment })).length; logger.info(`Found: '${activeEc2RunnersCountAfter}' active GitHub EC2 runners instances after clean-up.`); -} +} \ No newline at end of file diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index c21aa4c2f1..71eef30d99 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -4,11 +4,11 @@ import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util'; import yn from 'yn'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { createRunner, listEC2Runners } from './../aws/runners'; -import { RunnerInputParameters } from './../aws/runners.d'; +import { createRunner, listEC2Runners, RunnerInputParameters } from './../aws/runners'; import ScaleError from './ScaleError'; import { publishRetryMessage } from './job-retry'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; +import { resumeHibernatedInstances } from '../aws/runners'; const logger = createChildLogger('scale-up'); @@ -62,31 +62,6 @@ interface CreateEC2RunnerConfig { onDemandFailoverOnError?: string[]; } -function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) { - const config = [ - `--url ${githubRunnerConfig.ghesBaseUrl ?? 'https://github.com'}/${githubRunnerConfig.runnerOwner}`, - `--token ${token}`, - ]; - - if (githubRunnerConfig.runnerLabels) { - config.push(`--labels ${githubRunnerConfig.runnerLabels}`.trim()); - } - - if (githubRunnerConfig.disableAutoUpdate) { - config.push('--disableupdate'); - } - - if (githubRunnerConfig.runnerType === 'Org' && githubRunnerConfig.runnerGroup !== undefined) { - config.push(`--runnergroup ${githubRunnerConfig.runnerGroup}`); - } - - if (githubRunnerConfig.ephemeral) { - config.push(`--ephemeral`); - } - - return config; -} - async function getGithubRunnerRegistrationToken(githubRunnerConfig: CreateGitHubRunnerConfig, ghClient: Octokit) { const registrationToken = githubRunnerConfig.runnerType === 'Org' @@ -113,116 +88,6 @@ function removeTokenFromLogging(config: string[]): string[] { return result; } -export async function getInstallationId( - ghesApiUrl: string, - enableOrgLevel: boolean, - payload: ActionRequestMessage, -): Promise { - if (payload.installationId !== 0) { - return payload.installationId; - } - - const ghAuth = await createGithubAppAuth(undefined, ghesApiUrl); - const githubClient = await createOctokitClient(ghAuth.token, ghesApiUrl); - return enableOrgLevel - ? ( - await githubClient.apps.getOrgInstallation({ - org: payload.repositoryOwner, - }) - ).data.id - : ( - await githubClient.apps.getRepoInstallation({ - owner: payload.repositoryOwner, - repo: payload.repositoryName, - }) - ).data.id; -} - -export async function isJobQueued(githubInstallationClient: Octokit, payload: ActionRequestMessage): Promise { - let isQueued = false; - if (payload.eventType === 'workflow_job') { - const jobForWorkflowRun = await githubInstallationClient.actions.getJobForWorkflowRun({ - job_id: payload.id, - owner: payload.repositoryOwner, - repo: payload.repositoryName, - }); - metricGitHubAppRateLimit(jobForWorkflowRun.headers); - isQueued = jobForWorkflowRun.data.status === 'queued'; - } else { - throw Error(`Event ${payload.eventType} is not supported`); - } - return isQueued; -} - -async function getRunnerGroupId(githubRunnerConfig: CreateGitHubRunnerConfig, ghClient: Octokit): Promise { - // if the runnerType is Repo, then runnerGroupId is default to 1 - let runnerGroupId: number | undefined = 1; - if (githubRunnerConfig.runnerType === 'Org' && githubRunnerConfig.runnerGroup !== undefined) { - let runnerGroup: string | undefined; - // check if runner group id is already stored in SSM Parameter Store and - // use it if it exists to avoid API call to GitHub - try { - runnerGroup = await getParameter( - `${githubRunnerConfig.ssmConfigPath}/runner-group/${githubRunnerConfig.runnerGroup}`, - ); - } catch (err) { - logger.debug('Handling error:', err as Error); - logger.warn( - `SSM Parameter "${githubRunnerConfig.ssmConfigPath}/runner-group/${githubRunnerConfig.runnerGroup}" - for Runner group ${githubRunnerConfig.runnerGroup} does not exist`, - ); - } - if (runnerGroup === undefined) { - // get runner group id from GitHub - runnerGroupId = await getRunnerGroupByName(ghClient, githubRunnerConfig); - // store runner group id in SSM - try { - await putParameter( - `${githubRunnerConfig.ssmConfigPath}/runner-group/${githubRunnerConfig.runnerGroup}`, - runnerGroupId.toString(), - false, - ); - } catch (err) { - logger.debug('Error storing runner group id in SSM Parameter Store', err as Error); - throw err; - } - } else { - runnerGroupId = parseInt(runnerGroup); - } - } - return runnerGroupId; -} - -async function getRunnerGroupByName(ghClient: Octokit, githubRunnerConfig: CreateGitHubRunnerConfig): Promise { - const runnerGroups: RunnerGroup[] = await ghClient.paginate(`GET /orgs/{org}/actions/runner-groups`, { - org: githubRunnerConfig.runnerOwner, - per_page: 100, - }); - const runnerGroupId = runnerGroups.find((runnerGroup) => runnerGroup.name === githubRunnerConfig.runnerGroup)?.id; - - if (runnerGroupId === undefined) { - throw new Error(`Runner group ${githubRunnerConfig.runnerGroup} does not exist`); - } - - return runnerGroupId; -} - -export async function createRunners( - githubRunnerConfig: CreateGitHubRunnerConfig, - ec2RunnerConfig: CreateEC2RunnerConfig, - ghClient: Octokit, -): Promise { - const instances = await createRunner({ - runnerType: githubRunnerConfig.runnerType, - runnerOwner: githubRunnerConfig.runnerOwner, - numberOfRunners: 1, - ...ec2RunnerConfig, - }); - if (instances.length !== 0) { - await createStartRunnerConfig(githubRunnerConfig, instances, ghClient); - } -} - export async function scaleUp(eventSource: string, payload: ActionRequestMessage): Promise { logger.info(`Received ${payload.eventType} from ${payload.repositoryOwner}/${payload.repositoryName}`); @@ -241,7 +106,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const disableAutoUpdate = yn(process.env.DISABLE_RUNNER_AUTOUPDATE, { default: false }); const launchTemplateName = process.env.LAUNCH_TEMPLATE_NAME; const instanceMaxSpotPrice = process.env.INSTANCE_MAX_SPOT_PRICE; - const instanceAllocationStrategy = process.env.INSTANCE_ALLOCATION_STRATEGY || 'lowest-price'; // same as AWS default + const instanceAllocationStrategy = process.env.INSTANCE_ALLOCATION_STRATEGY || 'lowest-price'; const enableJobQueuedCheck = yn(process.env.ENABLE_JOB_QUEUED_CHECK, { default: true }); const amiIdSsmParameterName = process.env.AMI_ID_SSM_PARAMETER_NAME; const runnerNamePrefix = process.env.RUNNER_NAME_PREFIX || ''; @@ -293,7 +158,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const githubInstallationClient = await createOctokitClient(ghAuth.token, ghesApiUrl); if (!enableJobQueuedCheck || (await isJobQueued(githubInstallationClient, payload))) { - let scaleUp = true; + let scaleUpNeeded = true; if (maximumRunners !== -1) { const currentRunners = await listEC2Runners({ environment, @@ -301,55 +166,66 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage runnerOwner, }); logger.info(`Current runners: ${currentRunners.length} of ${maximumRunners}`); - scaleUp = currentRunners.length < maximumRunners; + scaleUpNeeded = currentRunners.length < maximumRunners; } - if (scaleUp) { - logger.info(`Attempting to launch a new runner`); - - await createRunners( - { - ephemeral, - enableJitConfig, - ghesBaseUrl, - runnerLabels, - runnerGroup, - runnerNamePrefix, - runnerOwner, - runnerType, - disableAutoUpdate, - ssmTokenPath, - ssmConfigPath, - }, - { - ec2instanceCriteria: { - instanceTypes, - targetCapacityType: instanceTargetCapacityType, - maxSpotPrice: instanceMaxSpotPrice, - instanceAllocationStrategy: instanceAllocationStrategy, + if (scaleUpNeeded) { + logger.info(`Attempting to launch or resume a runner`); + + // Before creating a new instance, try resuming hibernated instances + const neededCount = 1; // In this logic we often scale one runner at a time for a single event, adjust as needed + const resumedInstances = await resumeHibernatedInstances(neededCount); + + if (resumedInstances.length < neededCount) { + // Not enough hibernated instances, create new runner + await createRunners( + { + ephemeral, + enableJitConfig, + ghesBaseUrl, + runnerLabels, + runnerGroup, + runnerNamePrefix, + runnerOwner, + runnerType, + disableAutoUpdate, + ssmTokenPath, + ssmConfigPath, }, - environment, - launchTemplateName, - subnets, - amiIdSsmParameterName, - tracingEnabled, - onDemandFailoverOnError, - }, - githubInstallationClient, - ); + { + ec2instanceCriteria: { + instanceTypes, + targetCapacityType: instanceTargetCapacityType, + maxSpotPrice: instanceMaxSpotPrice, + instanceAllocationStrategy: instanceAllocationStrategy, + }, + environment, + launchTemplateName, + subnets, + amiIdSsmParameterName, + tracingEnabled, + onDemandFailoverOnError, + }, + githubInstallationClient, + ); + } await publishRetryMessage(payload); } else { - logger.info('No runner will be created, maximum number of runners reached.'); + logger.info('No runner will be created or resumed, maximum number of runners reached.'); if (ephemeral) { throw new ScaleError('No runners create: maximum of runners reached.'); } } } else { - logger.info('No runner will be created, job is not queued.'); + logger.info('No runner will be created or resumed, job is not queued.'); } } +function isValidRepoOwnerTypeIfOrgLevelEnabled(payload: ActionRequestMessage, enableOrgLevel: boolean): boolean { + return !(enableOrgLevel && payload.repoOwnerType !== 'Organization'); +} + export function getGitHubEnterpriseApiUrl() { const ghesBaseUrl = process.env.GHES_URL; let ghesApiUrl = ''; @@ -359,27 +235,38 @@ export function getGitHubEnterpriseApiUrl() { return { ghesApiUrl, ghesBaseUrl }; } -async function createStartRunnerConfig( - githubRunnerConfig: CreateGitHubRunnerConfig, - instances: string[], - ghClient: Octokit, -) { - if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) { - await createJitConfig(githubRunnerConfig, instances, ghClient); - } else { - await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); - } -} +async function getRunnerGroupId(githubRunnerConfig: CreateGitHubRunnerConfig, ghClient: Octokit): Promise { + let runnerGroupId: number | undefined = 1; + if (githubRunnerConfig.runnerType === 'Org' && githubRunnerConfig.runnerGroup !== undefined) { + let runnerGroup: string | undefined; + try { + runnerGroup = await getParameter( + `${githubRunnerConfig.ssmConfigPath}/runner-group/${githubRunnerConfig.runnerGroup}`, + ); + } catch (err) { + logger.debug('SSM parameter for runner group not found, will fetch from GitHub.', err as Error); + } + if (runnerGroup === undefined) { + const runnerGroups: RunnerGroup[] = await ghClient.paginate(`GET /orgs/{org}/actions/runner-groups`, { + org: githubRunnerConfig.runnerOwner, + per_page: 100, + }); + runnerGroupId = runnerGroups.find((runnerGroup) => runnerGroup.name === githubRunnerConfig.runnerGroup)?.id; -function isValidRepoOwnerTypeIfOrgLevelEnabled(payload: ActionRequestMessage, enableOrgLevel: boolean): boolean { - return !(enableOrgLevel && payload.repoOwnerType !== 'Organization'); -} + if (runnerGroupId === undefined) { + throw new Error(`Runner group ${githubRunnerConfig.runnerGroup} does not exist`); + } -function addDelay(instances: string[]) { - const delay = async (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); - const ssmParameterStoreMaxThroughput = 40; - const isDelay = instances.length >= ssmParameterStoreMaxThroughput; - return { isDelay, delay }; + await putParameter( + `${githubRunnerConfig.ssmConfigPath}/runner-group/${githubRunnerConfig.runnerGroup}`, + runnerGroupId.toString(), + false, + ); + } else { + runnerGroupId = parseInt(runnerGroup); + } + } + return runnerGroupId; } async function createRegistrationTokenConfig( @@ -387,7 +274,6 @@ async function createRegistrationTokenConfig( instances: string[], ghClient: Octokit, ) { - const { isDelay, delay } = addDelay(instances); const token = await getGithubRunnerRegistrationToken(githubRunnerConfig, ghClient); const runnerServiceConfig = generateRunnerServiceConfig(githubRunnerConfig, token); @@ -399,28 +285,23 @@ async function createRegistrationTokenConfig( await putParameter(`${githubRunnerConfig.ssmTokenPath}/${instance}`, runnerServiceConfig.join(' '), true, { tags: [{ Key: 'InstanceId', Value: instance }], }); - if (isDelay) { - // Delay to prevent AWS ssm rate limits by being within the max throughput limit - await delay(25); - } } } -async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit) { +async function createJitConfig( + githubRunnerConfig: CreateGitHubRunnerConfig, + instances: string[], + ghClient: Octokit, +) { const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, ghClient); - const { isDelay, delay } = addDelay(instances); const runnerLabels = githubRunnerConfig.runnerLabels.split(','); - logger.debug(`Runner group id: ${runnerGroupId}`); - logger.debug(`Runner labels: ${runnerLabels}`); for (const instance of instances) { - // generate jit config for runner registration - const ephemeralRunnerConfig: EphemeralRunnerConfig = { + const ephemeralRunnerConfig = { runnerName: `${githubRunnerConfig.runnerNamePrefix}${instance}`, runnerGroupId: runnerGroupId, runnerLabels: runnerLabels, }; - logger.debug(`Runner name: ${ephemeralRunnerConfig.runnerName}`); const runnerConfig = githubRunnerConfig.runnerType === 'Org' ? await ghClient.actions.generateRunnerJitconfigForOrg({ @@ -439,16 +320,94 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins metricGitHubAppRateLimit(runnerConfig.headers); - // store jit config in ssm parameter store - logger.debug('Runner JIT config for ephemeral runner generated.', { - instance: instance, - }); await putParameter(`${githubRunnerConfig.ssmTokenPath}/${instance}`, runnerConfig.data.encoded_jit_config, true, { tags: [{ Key: 'InstanceId', Value: instance }], }); - if (isDelay) { - // Delay to prevent AWS ssm rate limits by being within the max throughput limit - await delay(25); + } +} + +export async function createRunners( + githubRunnerConfig: CreateGitHubRunnerConfig, + ec2RunnerConfig: CreateEC2RunnerConfig, + ghClient: Octokit, +): Promise { + const instances = await createRunner({ + runnerType: githubRunnerConfig.runnerType, + runnerOwner: githubRunnerConfig.runnerOwner, + numberOfRunners: ec2RunnerConfig.numberOfRunners || 1, + ...ec2RunnerConfig, + }); + if (instances.length !== 0) { + if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) { + await createJitConfig(githubRunnerConfig, instances, ghClient); + } else { + await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); } } } + +export async function getInstallationId( + ghesApiUrl: string, + enableOrgLevel: boolean, + payload: ActionRequestMessage, +): Promise { + if (payload.installationId !== 0) { + return payload.installationId; + } + + const ghAuth = await createGithubAppAuth(undefined, ghesApiUrl); + const githubClient = await createOctokitClient(ghAuth.token, ghesApiUrl); + return enableOrgLevel + ? ( + await githubClient.apps.getOrgInstallation({ + org: payload.repositoryOwner, + }) + ).data.id + : ( + await githubClient.apps.getRepoInstallation({ + owner: payload.repositoryOwner, + repo: payload.repositoryName, + }) + ).data.id; +} + +export async function isJobQueued(githubInstallationClient: Octokit, payload: ActionRequestMessage): Promise { + let isQueued = false; + if (payload.eventType === 'workflow_job') { + const jobForWorkflowRun = await githubInstallationClient.actions.getJobForWorkflowRun({ + job_id: payload.id, + owner: payload.repositoryOwner, + repo: payload.repositoryName, + }); + metricGitHubAppRateLimit(jobForWorkflowRun.headers); + isQueued = jobForWorkflowRun.data.status === 'queued'; + } else { + throw Error(`Event ${payload.eventType} is not supported`); + } + return isQueued; +} + +function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) { + const config = [ + `--url ${githubRunnerConfig.ghesBaseUrl ?? 'https://github.com'}/${githubRunnerConfig.runnerOwner}`, + `--token ${token}`, + ]; + + if (githubRunnerConfig.runnerLabels) { + config.push(`--labels ${githubRunnerConfig.runnerLabels}`.trim()); + } + + if (githubRunnerConfig.disableAutoUpdate) { + config.push('--disableupdate'); + } + + if (githubRunnerConfig.runnerType === 'Org' && githubRunnerConfig.runnerGroup !== undefined) { + config.push(`--runnergroup ${githubRunnerConfig.runnerGroup}`); + } + + if (githubRunnerConfig.ephemeral) { + config.push(`--ephemeral`); + } + + return config; +}