Commit 5285cdc
feat: add support to use custom scale errors (#4932)
## Summary

This PR makes the list of EC2 scaling error codes configurable instead of hardcoded in the control-plane Lambda. It allows users to extend or override the default retryable error set without forcing a change on everyone.

## Motivation

Issue [#4105] was closed without a PR, leaving the scale error list hardcoded. Different environments can encounter additional EC2 error codes that should trigger retries; making this list configurable lets users adapt behavior without modifying the library code.

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 1262a67 commit 5285cdc

File tree

21 files changed: +88 additions, −22 deletions


README.md

Lines changed: 1 addition & 0 deletions

@@ -215,6 +215,7 @@ Join our discord community via [this invite link](https://discord.gg/bxgXW8jJGh)
 | <a name="input_runners_scale_up_lambda_timeout"></a> [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no |
 | <a name="input_runners_ssm_housekeeper"></a> [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.<br/><br/> `schedule_expression`: is used to configure the schedule for the lambda.<br/> `enabled`: enable or disable the lambda trigger via the EventBridge.<br/> `lambda_memory_size`: lambda memory size limit.<br/> `lambda_timeout`: timeout for the lambda in seconds.<br/> `config`: configuration for the lambda function. Token path will be read by default from the module. | <pre>object({<br/> schedule_expression = optional(string, "rate(1 day)")<br/> enabled = optional(bool, true)<br/> lambda_memory_size = optional(number, 512)<br/> lambda_timeout = optional(number, 60)<br/> config = object({<br/> tokenPath = optional(string)<br/> minimumDaysOld = optional(number, 1)<br/> dryRun = optional(bool, false)<br/> })<br/> })</pre> | <pre>{<br/> "config": {}<br/>}</pre> | no |
 | <a name="input_scale_down_schedule_expression"></a> [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
+| <a name="input_scale_errors"></a> [scale\_errors](#input\_scale\_errors) | List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts | `list(string)` | <pre>[<br/> "UnfulfillableCapacity",<br/> "MaxSpotInstanceCountExceeded",<br/> "TargetCapacityLimitExceededException",<br/> "RequestLimitExceeded",<br/> "ResourceLimitExceeded",<br/> "MaxSpotInstanceCountExceeded",<br/> "MaxSpotFleetRequestCountExceeded",<br/> "InsufficientInstanceCapacity",<br/> "InsufficientCapacityOnHost"<br/>]</pre> | no |
 | <a name="input_scale_up_reserved_concurrent_executions"></a> [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no |
 | <a name="input_ssm_paths"></a> [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. | <pre>object({<br/> root = optional(string, "github-action-runners")<br/> app = optional(string, "app")<br/> runners = optional(string, "runners")<br/> webhook = optional(string, "webhook")<br/> use_prefix = optional(bool, true)<br/> })</pre> | `{}` | no |
 | <a name="input_state_event_rule_binaries_syncer"></a> [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no |

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions

@@ -44,4 +44,5 @@ export interface RunnerInputParameters {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
+  scaleErrors: string[];
 }

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 3 additions & 0 deletions

@@ -429,6 +429,7 @@ describe('create runner with errors', () => {
   allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
   capacityType: 'spot',
   type: 'Repo',
+  scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded'],
 };
 const defaultExpectedFleetRequestValues: ExpectedFleetRequestValues = {
   type: 'Repo',
@@ -699,6 +700,7 @@ interface RunnerConfig {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
+  scaleErrors: string[];
 }

 function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -718,6 +720,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
   amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
   tracingEnabled: runnerConfig.tracingEnabled,
   onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
+  scaleErrors: runnerConfig.scaleErrors,
 };
}

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 1 addition & 12 deletions

@@ -196,18 +196,7 @@ async function processFleetResult(
   return instances;
 }

-  // Educated guess of errors that would make sense to retry based on the list
-  // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
-  const scaleErrors = [
-    'UnfulfillableCapacity',
-    'MaxSpotInstanceCountExceeded',
-    'TargetCapacityLimitExceededException',
-    'RequestLimitExceeded',
-    'ResourceLimitExceeded',
-    'MaxSpotInstanceCountExceeded',
-    'MaxSpotFleetRequestCountExceeded',
-    'InsufficientInstanceCapacity',
-  ];
+  const scaleErrors = runnerParameters.scaleErrors;

   const failedCount = countScaleErrors(errors, scaleErrors);
   if (failedCount > 0) {

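The hunk above swaps the hardcoded list for the caller-supplied one; `countScaleErrors` then tallies how many of the returned CreateFleet errors match the configured set. A minimal sketch of that matching, assuming a simplified error shape — the helper name comes from runners.ts, but this body is an illustration, not the repository's implementation:

```typescript
// Simplified shape of a CreateFleet error entry (assumed for illustration).
interface FleetError {
  ErrorCode?: string;
  ErrorMessage?: string;
}

// Count only the errors whose code appears in the configured retryable set.
function countScaleErrors(errors: FleetError[], scaleErrors: string[]): number {
  return errors.filter((e) => e.ErrorCode !== undefined && scaleErrors.includes(e.ErrorCode)).length;
}

const errors: FleetError[] = [
  { ErrorCode: 'UnfulfillableCapacity' },
  { ErrorCode: 'InvalidParameterValue' },
];
const result = countScaleErrors(errors, ['UnfulfillableCapacity', 'RequestLimitExceeded']);
// result === 1: only UnfulfillableCapacity is in the configured list
```

With the configurable list, an environment that also wants to retry on, say, `InsufficientCapacityOnHost` only has to add that code to `scale_errors` rather than patch the lambda.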
lambdas/functions/control-plane/src/modules.d.ts

Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@ declare namespace NodeJS {
   AWS_REGION: string;
   ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string;
   ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string;
+  SCALE_ERRORS: string;
   ENVIRONMENT: string;
   GHES_URL: string;
   JOB_RETRY_CONFIG: string;

lambdas/functions/control-plane/src/pool/pool.test.ts

Lines changed: 2 additions & 0 deletions

@@ -140,6 +140,8 @@ beforeEach(() => {
   process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
   process.env.RUNNER_OWNER = ORG;
   process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString();
+  process.env.SCALE_ERRORS =
+    '["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';

   const mockTokenReturnValue = {
     data: {

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 2 additions & 0 deletions

@@ -41,6 +41,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
   const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
     ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
     : [];
+  const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];

   const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();

@@ -95,6 +96,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
       amiIdSsmParameterName,
       tracingEnabled,
       onDemandFailoverOnError,
+      scaleErrors,
     },
     topUp,
     githubInstallationClient,
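Both pool.ts and scale-up.ts read the new `SCALE_ERRORS` variable with a bare `JSON.parse`, relying on the Terraform module to always inject a valid JSON array (presumably via `jsonencode`). A more defensive parse could look like the sketch below; the fallback list and function name are illustrative assumptions, not part of this commit:

```typescript
// Default matches the module's documented scale_errors default (truncated here
// for brevity); the committed code has no such fallback — this is a sketch.
const DEFAULT_SCALE_ERRORS = [
  'UnfulfillableCapacity',
  'MaxSpotInstanceCountExceeded',
  'TargetCapacityLimitExceededException',
];

function parseScaleErrors(raw: string | undefined): string[] {
  if (!raw) return DEFAULT_SCALE_ERRORS;
  try {
    const parsed = JSON.parse(raw);
    // Accept only an array of strings; anything else falls back to the default.
    if (Array.isArray(parsed) && parsed.every((e) => typeof e === 'string')) {
      return parsed;
    }
  } catch {
    // Malformed JSON: fall through to the default list.
  }
  return DEFAULT_SCALE_ERRORS;
}
```

The bare-parse approach the commit takes is reasonable here because the variable is set by the module itself, not by an end user typing JSON by hand; a malformed value would surface immediately as a lambda error.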

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 13 additions & 0 deletions

@@ -105,6 +105,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
   subnets: ['subnet-123'],
   tracingEnabled: false,
   onDemandFailoverOnError: [],
+  scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'],
 };
 let expectedRunnerParams: RunnerInputParameters;

@@ -122,6 +123,8 @@ function setDefaults() {
   process.env.INSTANCE_TYPES = 'm5.large';
   process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
   process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
+  process.env.SCALE_ERRORS =
+    '["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
 }

 beforeEach(() => {
@@ -809,6 +812,16 @@ describe('scaleUp with public GH', () => {
   });
 });

+  it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
+    process.env.RUNNER_LABELS = 'label1,label2';
+    process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
+    await scaleUpModule.scaleUp(TEST_DATA);
+    expect(createRunner).toBeCalledWith({
+      ...expectedRunnerParams,
+      scaleErrors: ['RequestLimitExceeded'],
+    });
+  });
+
 it('creates a runner and ensure the group argument is ignored', async () => {
   process.env.RUNNER_LABELS = 'label1,label2';
   process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED';

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 3 additions & 0 deletions

@@ -62,6 +62,7 @@ interface CreateEC2RunnerConfig {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
+  scaleErrors: string[];
 }

 function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -255,6 +256,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
   const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
     ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
     : [];
+  const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];

   const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();

@@ -431,6 +433,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
   amiIdSsmParameterName,
   tracingEnabled,
   onDemandFailoverOnError,
+  scaleErrors,
 },
 newRunners,
 githubInstallationClient,

main.tf

Lines changed: 1 addition & 0 deletions

@@ -187,6 +187,7 @@ module "runners" {
   enable_jit_config                    = var.enable_jit_config
   enable_job_queued_check              = var.enable_job_queued_check
   enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
+  scale_errors                         = var.scale_errors
   disable_runner_autoupdate            = var.disable_runner_autoupdate
   enable_managed_runner_security_group = var.enable_managed_runner_security_group
   enable_runner_detailed_monitoring    = var.enable_runner_detailed_monitoring
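main.tf forwards `var.scale_errors` into the runners module, which ultimately serializes it into the lambda's `SCALE_ERRORS` environment variable. The round trip is plain JSON: a `jsonencode`-style serialization on the producing side, `JSON.parse` on the consuming side. A small sketch of that round trip (variable names and values are illustrative):

```typescript
// What an operator might set in Terraform as scale_errors:
const scaleErrorsConfig: string[] = ['RequestLimitExceeded', 'InsufficientInstanceCapacity'];

// JSON.stringify produces the same wire format as Terraform's jsonencode,
// i.e. the string that would land in the lambda's SCALE_ERRORS variable:
const envValue = JSON.stringify(scaleErrorsConfig);

// What pool.ts / scale-up.ts do on the consuming side:
const scaleErrors = JSON.parse(envValue) as string[];
// scaleErrors deep-equals scaleErrorsConfig
```

Because the variable replaces (rather than merges with) the former hardcoded list, operators who only want to add a code should copy the documented default and append to it.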
