123 changes: 74 additions & 49 deletions lambdas/functions/control-plane/src/aws/runners.ts
@@ -2,13 +2,17 @@ import {
CreateFleetCommand,
CreateFleetResult,
CreateTagsCommand,
DefaultTargetCapacityType,
DescribeInstancesCommand,
DescribeInstancesResult,
EC2Client,
FleetLaunchTemplateOverridesRequest,
Tag,
TerminateInstancesCommand,
StopInstancesCommand,
StartInstancesCommand,
_InstanceType,
Filter,
} from '@aws-sdk/client-ec2';
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
import { getTracedAWSV3Client, tracer } from '@aws-github-runner/aws-powertools-util';
@@ -167,15 +171,12 @@ async function processFleetResult(
);
const errors = fleet.Errors?.flatMap((e) => e.ErrorCode || '') || [];

// Educated guess of errors that would make sense to retry based on the list
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
const scaleErrors = [
'UnfulfillableCapacity',
'MaxSpotInstanceCountExceeded',
'TargetCapacityLimitExceededException',
'RequestLimitExceeded',
'ResourceLimitExceeded',
'MaxSpotInstanceCountExceeded',
'MaxSpotFleetRequestCountExceeded',
'InsufficientInstanceCapacity',
];
@@ -184,7 +185,7 @@ async function processFleetResult(
errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) &&
runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot'
) {
logger.warn(`Create fleet failed, initatiing fall back to on demand instances.`);
logger.warn(`Create fleet failed, initiating fall back to on demand instances.`);
logger.debug('Create fleet failed.', { data: fleet.Errors });
const numberOfInstances = runnerParameters.numberOfRunners - instances.length;
const instancesOnDemand = await createRunner({
Expand Down Expand Up @@ -218,12 +219,10 @@ async function getAmiIdOverride(runnerParameters: Runners.RunnerInputParameters)
return amiIdOverride;
} catch (e) {
logger.debug(
`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}. ` +
'Please ensure that the given parameter exists on this region and contains a valid runner AMI ID',
`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}.`,
{ error: e },
);
throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName},
${e}`);
throw new Error(`Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}, ${e}`);
}
}

@@ -244,54 +243,80 @@ async function createInstances(
tags.push({ Key: 'ghr:trace_id', Value: traceId! });
}

let fleet: CreateFleetResult;
try {
// see for spec https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html
const createFleetCommand = new CreateFleetCommand({
LaunchTemplateConfigs: [
{
LaunchTemplateSpecification: {
LaunchTemplateName: runnerParameters.launchTemplateName,
Version: '$Default',
},
Overrides: generateFleetOverrides(
runnerParameters.subnets,
runnerParameters.ec2instanceCriteria.instanceTypes,
amiIdOverride,
),
const createFleetCommand = new CreateFleetCommand({
LaunchTemplateConfigs: [
{
LaunchTemplateSpecification: {
LaunchTemplateName: runnerParameters.launchTemplateName,
Version: '$Default',
},
],
SpotOptions: {
MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice,
AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy,
Overrides: generateFleetOverrides(
runnerParameters.subnets,
runnerParameters.ec2instanceCriteria.instanceTypes,
amiIdOverride,
),
},
TargetCapacitySpecification: {
TotalTargetCapacity: runnerParameters.numberOfRunners,
DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType,
],
SpotOptions: {
MaxTotalPrice: runnerParameters.ec2instanceCriteria.maxSpotPrice,
AllocationStrategy: runnerParameters.ec2instanceCriteria.instanceAllocationStrategy,
},
TargetCapacitySpecification: {
TotalTargetCapacity: runnerParameters.numberOfRunners,
DefaultTargetCapacityType: runnerParameters.ec2instanceCriteria.targetCapacityType,
},
TagSpecifications: [
{
ResourceType: 'instance',
Tags: tags,
},
TagSpecifications: [
{
ResourceType: 'instance',
Tags: tags,
},
{
ResourceType: 'volume',
Tags: tags,
},
],
Type: 'instant',
});
fleet = await ec2Client.send(createFleetCommand);
} catch (e) {
logger.warn('Create fleet request failed.', { error: e as Error });
throw e;
}
return fleet;
{
ResourceType: 'volume',
Tags: tags,
},
],
Type: 'instant',
});
return await ec2Client.send(createFleetCommand);
}

// If launchTime is undefined, this will return false
export function bootTimeExceeded(ec2Runner: { launchTime?: Date }): boolean {
const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;
const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes');
return launchTimePlusBootTime < moment(new Date()).utc();
}

// New: Hibernate the runner instance
export async function hibernateRunner(instanceId: string): Promise<void> {
logger.debug(`Runner '${instanceId}' will be hibernated (stopped with hibernation).`);
const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
// Note: Stopping an instance that supports hibernation and has hibernation configured will hibernate it.
await ec2.send(new StopInstancesCommand({ InstanceIds: [instanceId], Hibernate: true }));
await tag(instanceId, [{ Key: 'ghr:hibernated', Value: 'true' }]);
logger.debug(`Runner ${instanceId} has been hibernated.`);
}

// Helper function to resume hibernated instances
export async function resumeHibernatedInstances(count: number): Promise<string[]> {
const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
const filters: Filter[] = [
{ Name: 'instance-state-name', Values: ['stopped'] },
{ Name: 'tag:ghr:hibernated', Values: ['true'] },
{ Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
];

const desc = await ec2.send(new DescribeInstancesCommand({ Filters: filters }));
const stoppedInstances = desc.Reservations?.flatMap((r) => r.Instances?.map((i) => i.InstanceId!) || []) || [];

const toResume = stoppedInstances.slice(0, count);
if (toResume.length > 0) {
logger.info(`Resuming hibernated instances: ${toResume.join(',')}`);
await ec2.send(new StartInstancesCommand({ InstanceIds: toResume }));
// Flip the hibernated tag back to 'false' after start so the instance is not picked up for resume again
for (const instanceId of toResume) {
await tag(instanceId, [{ Key: 'ghr:hibernated', Value: 'false' }]);
}
}

return toResume;
}
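The two new helpers above only cover the EC2 side; nothing in this file decides when to resume a hibernated runner. A minimal sketch of how a scale-up path could prefer the hibernated pool before creating new capacity is shown below. The function name, import path, and the wiring into scale-up are assumptions for illustration; only `createRunner` and `resumeHibernatedInstances` come from this file.

```typescript
// Illustrative sketch only (not part of this PR): prefer resuming hibernated
// runners before creating fresh instances. The import path and the name
// scaleUpWithHibernationPool are assumptions.
import { createRunner, resumeHibernatedInstances } from './aws/runners';

export async function scaleUpWithHibernationPool(
  runnerParameters: Parameters<typeof createRunner>[0],
): Promise<string[]> {
  // Satisfy as much of the demand as possible from the hibernated pool first.
  const resumed = await resumeHibernatedInstances(runnerParameters.numberOfRunners);
  const remaining = runnerParameters.numberOfRunners - resumed.length;

  // Fall back to creating new instances for any capacity that is still missing.
  const created =
    remaining > 0 ? await createRunner({ ...runnerParameters, numberOfRunners: remaining }) : [];

  return [...resumed, ...created];
}
```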
106 changes: 65 additions & 41 deletions lambdas/functions/control-plane/src/scale-runners/scale-down.ts
@@ -3,11 +3,12 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
import moment from 'moment';

import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth';
import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners';
import { RunnerInfo, RunnerList } from './../aws/runners.d';
import { bootTimeExceeded, hibernateRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners';
import { RunnerInfo } from './../aws/runners.d';
import { GhRunners, githubCache } from './cache';
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
import { metricGitHubAppRateLimit } from '../github/rate-limit';
import yn from 'yn';

const logger = createChildLogger('scale-down');

@@ -103,39 +104,65 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
return launchTimePlusMinimum < now;
}

async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[], ephemeral: boolean): Promise<void> {
const githubAppClient = await getOrCreateOctokit(ec2runner);
try {
const states = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
// Get busy state instead of using the output of listGitHubRunners(...) to minimize the race condition.
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
}),
);

if (states.every((busy) => busy === false)) {
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);

if (statuses.every((status) => status == 204)) {
await terminateRunner(ec2runner.instanceId);
logger.debug(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
// If ephemeral, still terminate. If not ephemeral, hibernate.
if (ephemeral) {
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);

if (statuses.every((status) => status == 204)) {
await terminateRunner(ec2runner.instanceId);
logger.debug(`AWS runner instance '${ec2runner.instanceId}' terminated (ephemeral runner).`);
} else {
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
}
} else {
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
// Non-ephemeral runner: Hibernate
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);
if (statuses.every((status) => status == 204)) {
await hibernateRunner(ec2runner.instanceId);
logger.debug(`AWS runner instance '${ec2runner.instanceId}' is hibernated (non-ephemeral runner).`);
} else {
logger.error(`Failed to de-register GitHub runner before hibernation: ${statuses}`);
}
}
} else {
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
@@ -155,6 +182,8 @@ async function evaluateAndRemoveRunners(
const evictionStrategy = getEvictionStrategy(scaleDownConfigs);
const ownerTags = new Set(ec2Runners.map((runner) => runner.owner));

const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false });

for (const ownerTag of ownerTags) {
const ec2RunnersFiltered = ec2Runners
.filter((runner) => runner.owner === ownerTag)
@@ -172,16 +201,19 @@
logger.debug(
`GitHub runners for AWS runner instance: '${ec2Runner.instanceId}': ${JSON.stringify(ghRunnersFiltered)}`,
);

if (ghRunnersFiltered.length) {
if (runnerMinimumTimeExceeded(ec2Runner)) {
if (idleCounter > 0) {
idleCounter--;
logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`);
} else {
logger.info(`Terminating all non busy runners.`);
logger.info(`Terminating or hibernating non-busy runners.`);
await removeRunner(
ec2Runner,
ghRunnersFiltered.map((runner: { id: number }) => runner.id),
ephemeralEnabled,
);
}
}
@@ -230,26 +262,18 @@ export function newestFirstStrategy(a: RunnerInfo, b: RunnerInfo): number {
return oldestFirstStrategy(a, b) * -1;
}

async function listRunners(environment: string) {
return await listEC2Runners({
environment,
});
}

function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[];
}

export async function scaleDown(): Promise<void> {
githubCache.reset();
const environment = process.env.ENVIRONMENT;
const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig];

// first runners marked to be orphan.
// first terminate orphan runners
await terminateOrphan(environment);

// next scale down idle runners with respect to config and mark potential orphans
const ec2Runners = await listRunners(environment);
const ec2Runners = await listEC2Runners({
environment,
});
const activeEc2RunnersCount = ec2Runners.length;
logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);
logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(ec2Runners)}`);
@@ -259,9 +283,9 @@ export async function scaleDown(): Promise<void> {
return;
}

const runners = filterRunners(ec2Runners);
const runners = ec2Runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan);
await evaluateAndRemoveRunners(runners, scaleDownConfigs);

const activeEc2RunnersCountAfter = (await listRunners(environment)).length;
const activeEc2RunnersCountAfter = (await listEC2Runners({ environment })).length;
logger.info(`Found: '${activeEc2RunnersCountAfter}' active GitHub EC2 runner instances after clean-up.`);
}
}
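Taken together, the scale-down changes reduce to a per-runner decision once the minimum runtime has passed: keep the runner while the idle pool has room, otherwise de-register the non-busy runner and either terminate it (ephemeral) or hibernate it (non-ephemeral). A compact sketch of that decision follows; the `decideIdleAction` helper and `IdleAction` type are illustrative only and do not exist in this PR.

```typescript
// Illustrative sketch only: decideIdleAction and IdleAction are not part of
// this PR; they restate the branching added in evaluateAndRemoveRunners/removeRunner.
import yn from 'yn';

type IdleAction = 'keep-idle' | 'terminate' | 'hibernate';

export function decideIdleAction(idleCounter: number): IdleAction {
  const ephemeral = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false });
  if (idleCounter > 0) {
    // The idle pool still has room, so this runner stays registered and running.
    return 'keep-idle';
  }
  // Ephemeral runners are single-use: de-register and terminate.
  // Non-ephemeral runners are de-registered and hibernated for later reuse.
  return ephemeral ? 'terminate' : 'hibernate';
}
```

With ENABLE_EPHEMERAL_RUNNERS unset, `decideIdleAction(0)` returns 'hibernate', which is the new default for idle, non-busy runners that have exceeded their minimum runtime.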