diff --git a/README.md b/README.md
index 1ca2e82e22..7de101b517 100644
--- a/README.md
+++ b/README.md
@@ -215,6 +215,7 @@ Join our discord community via [this invite link](https://discord.gg/bxgXW8jJGh)
| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no |
| [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.
`schedule_expression`: is used to configure the schedule for the lambda.
`enabled`: enable or disable the lambda trigger via the EventBridge.
`lambda_memory_size`: lambda memory size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
}) | {
"config": {}
} | no |
| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
+| [scale\_errors](#input\_scale\_errors) | List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts | `list(string)` | [| no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. |
"UnfulfillableCapacity",
"MaxSpotInstanceCountExceeded",
"TargetCapacityLimitExceededException",
"RequestLimitExceeded",
"ResourceLimitExceeded",
"MaxSpotInstanceCountExceeded",
"MaxSpotFleetRequestCountExceeded",
"InsufficientInstanceCapacity",
"InsufficientCapacityOnHost"
]
object({
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
use_prefix = optional(bool, true)
}) | `{}` | no |
| [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no |
diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts
index 72ff9e3e1a..3975d093f5 100644
--- a/lambdas/functions/control-plane/src/aws/runners.d.ts
+++ b/lambdas/functions/control-plane/src/aws/runners.d.ts
@@ -44,4 +44,5 @@ export interface RunnerInputParameters {
amiIdSsmParameterName?: string;
tracingEnabled?: boolean;
onDemandFailoverOnError?: string[];
+ scaleErrors: string[];
}
diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts
index c4fd922fd0..c3cc6d8487 100644
--- a/lambdas/functions/control-plane/src/aws/runners.test.ts
+++ b/lambdas/functions/control-plane/src/aws/runners.test.ts
@@ -429,6 +429,7 @@ describe('create runner with errors', () => {
allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
capacityType: 'spot',
type: 'Repo',
+ scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded'],
};
const defaultExpectedFleetRequestValues: ExpectedFleetRequestValues = {
type: 'Repo',
@@ -699,6 +700,7 @@ interface RunnerConfig {
amiIdSsmParameterName?: string;
tracingEnabled?: boolean;
onDemandFailoverOnError?: string[];
+ scaleErrors: string[];
}
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -718,6 +720,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
tracingEnabled: runnerConfig.tracingEnabled,
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
+ scaleErrors: runnerConfig.scaleErrors,
};
}
diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts
index d95dc99fa4..0f4fc5bee9 100644
--- a/lambdas/functions/control-plane/src/aws/runners.ts
+++ b/lambdas/functions/control-plane/src/aws/runners.ts
@@ -196,18 +196,7 @@ async function processFleetResult(
return instances;
}
- // Educated guess of errors that would make sense to retry based on the list
- // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
- const scaleErrors = [
- 'UnfulfillableCapacity',
- 'MaxSpotInstanceCountExceeded',
- 'TargetCapacityLimitExceededException',
- 'RequestLimitExceeded',
- 'ResourceLimitExceeded',
- 'MaxSpotInstanceCountExceeded',
- 'MaxSpotFleetRequestCountExceeded',
- 'InsufficientInstanceCapacity',
- ];
+ const scaleErrors = runnerParameters.scaleErrors;
const failedCount = countScaleErrors(errors, scaleErrors);
if (failedCount > 0) {
diff --git a/lambdas/functions/control-plane/src/modules.d.ts b/lambdas/functions/control-plane/src/modules.d.ts
index 7570f29035..ff447c0e51 100644
--- a/lambdas/functions/control-plane/src/modules.d.ts
+++ b/lambdas/functions/control-plane/src/modules.d.ts
@@ -3,6 +3,7 @@ declare namespace NodeJS {
AWS_REGION: string;
ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string;
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string;
+ SCALE_ERRORS: string;
ENVIRONMENT: string;
GHES_URL: string;
JOB_RETRY_CONFIG: string;
diff --git a/lambdas/functions/control-plane/src/pool/pool.test.ts b/lambdas/functions/control-plane/src/pool/pool.test.ts
index c05a8b8cb7..aaa6aea715 100644
--- a/lambdas/functions/control-plane/src/pool/pool.test.ts
+++ b/lambdas/functions/control-plane/src/pool/pool.test.ts
@@ -140,6 +140,8 @@ beforeEach(() => {
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
process.env.RUNNER_OWNER = ORG;
process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString();
+ process.env.SCALE_ERRORS =
+ '["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]';
const mockTokenReturnValue = {
data: {
diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts
index aa690e97f6..4da3e7355d 100644
--- a/lambdas/functions/control-plane/src/pool/pool.ts
+++ b/lambdas/functions/control-plane/src/pool/pool.ts
@@ -41,6 +41,7 @@ export async function adjust(event: PoolEvent): Promiseobject({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_github_app_rate_limit = optional(bool, true)
enable_job_retry = optional(bool, true)
enable_spot_termination_warning = optional(bool, true)
}), {})
}) | `{}` | no |
-| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null)
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
placement = optional(object({
affinity = optional(string)
availability_zone = optional(string)
group_id = optional(string)
group_name = optional(string)
host_id = optional(string)
host_resource_group_arn = optional(number)
spread_domain = optional(string)
tenancy = optional(string)
partition_number = optional(number)
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
})) | n/a | yes |
+| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null)
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
scale_errors = optional(list(string), [
"UnfulfillableCapacity",
"MaxSpotInstanceCountExceeded",
"TargetCapacityLimitExceededException",
"RequestLimitExceeded",
"ResourceLimitExceeded",
"MaxSpotInstanceCountExceeded",
"MaxSpotFleetRequestCountExceeded",
"InsufficientInstanceCapacity",
"InsufficientCapacityOnHost",
])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
placement = optional(object({
affinity = optional(string)
availability_zone = optional(string)
group_id = optional(string)
group_name = optional(string)
host_id = optional(string)
host_resource_group_arn = optional(number)
spread_domain = optional(string)
tenancy = optional(string)
partition_number = optional(number)
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
})) | n/a | yes |
| [pool\_lambda\_reserved\_concurrent\_executions](#input\_pool\_lambda\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no |
| [pool\_lambda\_timeout](#input\_pool\_lambda\_timeout) | Time out for the pool lambda in seconds. | `number` | `60` | no |
| [prefix](#input\_prefix) | The prefix used for naming resources | `string` | `"github-actions"` | no |
diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf
index ea8fb17dce..e8454b5b69 100644
--- a/modules/multi-runner/runners.tf
+++ b/modules/multi-runner/runners.tf
@@ -32,6 +32,7 @@ module "runners" {
github_app_parameters = local.github_app_parameters
ebs_optimized = each.value.runner_config.ebs_optimized
enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
+ scale_errors = each.value.runner_config.scale_errors
enable_organization_runners = each.value.runner_config.enable_organization_runners
enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners
enable_jit_config = each.value.runner_config.enable_jit_config
diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf
index 0ca473ecf2..99c9a1ba00 100644
--- a/modules/multi-runner/variables.tf
+++ b/modules/multi-runner/variables.tf
@@ -71,14 +71,25 @@ variable "multi_runner_config" {
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null)
- create_service_linked_role_spot = optional(bool, false)
- credit_specification = optional(string, null)
- delay_webhook_event = optional(number, 30)
- disable_runner_autoupdate = optional(bool, false)
- ebs_optimized = optional(bool, false)
- enable_ephemeral_runners = optional(bool, false)
- enable_job_queued_check = optional(bool, null)
- enable_on_demand_failover_for_errors = optional(list(string), [])
+ create_service_linked_role_spot = optional(bool, false)
+ credit_specification = optional(string, null)
+ delay_webhook_event = optional(number, 30)
+ disable_runner_autoupdate = optional(bool, false)
+ ebs_optimized = optional(bool, false)
+ enable_ephemeral_runners = optional(bool, false)
+ enable_job_queued_check = optional(bool, null)
+ enable_on_demand_failover_for_errors = optional(list(string), [])
+ scale_errors = optional(list(string), [
+ "UnfulfillableCapacity",
+ "MaxSpotInstanceCountExceeded",
+ "TargetCapacityLimitExceededException",
+ "RequestLimitExceeded",
+ "ResourceLimitExceeded",
+ "MaxSpotInstanceCountExceeded",
+ "MaxSpotFleetRequestCountExceeded",
+ "InsufficientInstanceCapacity",
+ "InsufficientCapacityOnHost",
+ ])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
@@ -197,6 +208,7 @@ variable "multi_runner_config" {
enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
+ scale_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
enable_organization_runners: "Register runners to organization, instead of repo level"
enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."
diff --git a/modules/runners/README.md b/modules/runners/README.md
index 9bb3a6f4e6..5955110a20 100644
--- a/modules/runners/README.md
+++ b/modules/runners/README.md
@@ -221,6 +221,7 @@ yarn run dist
| [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check. | `number` | `3` | no |
| [s3\_runner\_binaries](#input\_s3\_runner\_binaries) | Bucket details for cached GitHub binary. | object({
arn = string
id = string
key = string
}) | n/a | yes |
| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
+| [scale\_errors](#input\_scale\_errors) | List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts | `list(string)` | [| no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to consume accepted build events. |
"UnfulfillableCapacity",
"MaxSpotInstanceCountExceeded",
"TargetCapacityLimitExceededException",
"RequestLimitExceeded",
"ResourceLimitExceeded",
"MaxSpotInstanceCountExceeded",
"MaxSpotFleetRequestCountExceeded",
"InsufficientInstanceCapacity",
"InsufficientCapacityOnHost"
]
object({
arn = string
url = string
}) | n/a | yes |
| [ssm\_housekeeper](#input\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.object({
schedule_expression = optional(string, "rate(1 day)")
state = optional(string, "ENABLED")
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
}) | {
"config": {}
} | no |
diff --git a/modules/runners/pool.tf b/modules/runners/pool.tf
index 2019ebbc6f..7d03d1558b 100644
--- a/modules/runners/pool.tf
+++ b/modules/runners/pool.tf
@@ -42,6 +42,7 @@ module "pool" {
ephemeral = var.enable_ephemeral_runners
enable_jit_config = var.enable_jit_config
enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
+ scale_errors = var.scale_errors
boot_time_in_minutes = var.runner_boot_time_in_minutes
labels = var.runner_labels
launch_template = aws_launch_template.runner
diff --git a/modules/runners/pool/README.md b/modules/runners/pool/README.md
index 4bd268c37c..2578bb1001 100644
--- a/modules/runners/pool/README.md
+++ b/modules/runners/pool/README.md
@@ -49,7 +49,7 @@ No modules.
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| [aws\_partition](#input\_aws\_partition) | (optional) partition for the arn if not 'aws' | `string` | `"aws"` | no |
-| [config](#input\_config) | Lookup details in parent module. | object({
lambda = object({
log_level = string
logging_retention_in_days = number
logging_kms_key_id = string
reserved_concurrent_executions = number
s3_bucket = string
s3_key = string
s3_object_version = string
security_group_ids = list(string)
runtime = string
architecture = string
memory_size = number
timeout = number
zip = string
subnet_ids = list(string)
})
tags = map(string)
ghes = object({
url = string
ssl_verify = string
})
github_app_parameters = object({
key_base64 = map(string)
id = map(string)
})
subnet_ids = list(string)
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
name = string
})
group_name = string
name_prefix = string
pool_owner = string
role = object({
arn = string
})
})
instance_types = list(string)
instance_target_capacity_type = string
instance_allocation_strategy = string
instance_max_spot_price = string
prefix = string
pool = list(object({
schedule_expression = string
schedule_expression_timezone = string
size = number
}))
role_permissions_boundary = string
kms_key_arn = string
ami_kms_key_arn = string
ami_id_ssm_parameter_arn = string
role_path = string
ssm_token_path = string
ssm_config_path = string
ami_id_ssm_parameter_name = string
ami_id_ssm_parameter_read_policy_arn = string
arn_ssm_parameters_path_config = string
lambda_tags = map(string)
user_agent = string
}) | n/a | yes |
+| [config](#input\_config) | Lookup details in parent module. | object({
lambda = object({
log_level = string
logging_retention_in_days = number
logging_kms_key_id = string
reserved_concurrent_executions = number
s3_bucket = string
s3_key = string
s3_object_version = string
security_group_ids = list(string)
runtime = string
architecture = string
memory_size = number
timeout = number
zip = string
subnet_ids = list(string)
})
tags = map(string)
ghes = object({
url = string
ssl_verify = string
})
github_app_parameters = object({
key_base64 = map(string)
id = map(string)
})
subnet_ids = list(string)
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
scale_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
name = string
})
group_name = string
name_prefix = string
pool_owner = string
role = object({
arn = string
})
})
instance_types = list(string)
instance_target_capacity_type = string
instance_allocation_strategy = string
instance_max_spot_price = string
prefix = string
pool = list(object({
schedule_expression = string
schedule_expression_timezone = string
size = number
}))
role_permissions_boundary = string
kms_key_arn = string
ami_kms_key_arn = string
ami_id_ssm_parameter_arn = string
role_path = string
ssm_token_path = string
ssm_config_path = string
ami_id_ssm_parameter_name = string
ami_id_ssm_parameter_read_policy_arn = string
arn_ssm_parameters_path_config = string
lambda_tags = map(string)
user_agent = string
}) | n/a | yes |
| [tracing\_config](#input\_tracing\_config) | Configuration for lambda tracing. | object({
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
}) | `{}` | no |
## Outputs
diff --git a/modules/runners/pool/main.tf b/modules/runners/pool/main.tf
index e141b22d25..30f39a262a 100644
--- a/modules/runners/pool/main.tf
+++ b/modules/runners/pool/main.tf
@@ -47,6 +47,7 @@ resource "aws_lambda_function" "pool" {
POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests
POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.config.runner.enable_on_demand_failover_for_errors)
+ SCALE_ERRORS = jsonencode(var.config.runner.scale_errors)
}
}
diff --git a/modules/runners/pool/variables.tf b/modules/runners/pool/variables.tf
index f1e841cde6..f1536d11c8 100644
--- a/modules/runners/pool/variables.tf
+++ b/modules/runners/pool/variables.tf
@@ -32,6 +32,7 @@ variable "config" {
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
+ scale_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf
index b97fefed4f..bb3fc83e52 100644
--- a/modules/runners/scale-up.tf
+++ b/modules/runners/scale-up.tf
@@ -59,6 +59,7 @@ resource "aws_lambda_function" "scale_up" {
SSM_CONFIG_PATH = "${var.ssm_paths.root}/${var.ssm_paths.config}"
SUBNET_IDS = join(",", var.subnet_ids)
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.enable_on_demand_failover_for_errors)
+ SCALE_ERRORS = jsonencode(var.scale_errors)
JOB_RETRY_CONFIG = jsonencode(local.job_retry_config)
}
}
diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf
index 8c8a1a136c..e40967fb89 100644
--- a/modules/runners/variables.tf
+++ b/modules/runners/variables.tf
@@ -701,6 +701,22 @@ variable "enable_on_demand_failover_for_errors" {
default = []
}
+variable "scale_errors" {
+ description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
+ type = list(string)
+ default = [
+ "UnfulfillableCapacity",
+ "MaxSpotInstanceCountExceeded",
+ "TargetCapacityLimitExceededException",
+ "RequestLimitExceeded",
+ "ResourceLimitExceeded",
+ "MaxSpotInstanceCountExceeded",
+ "MaxSpotFleetRequestCountExceeded",
+ "InsufficientInstanceCapacity",
+ "InsufficientCapacityOnHost",
+ ]
+}
+
variable "lambda_tags" {
description = "Map of tags that will be added to all the lambda function resources. Note these are additional tags to the default tags."
type = map(string)
diff --git a/variables.tf b/variables.tf
index bc9d3abac5..a65464afb3 100644
--- a/variables.tf
+++ b/variables.tf
@@ -283,6 +283,22 @@ variable "enable_runner_on_demand_failover_for_errors" {
default = []
}
+variable "scale_errors" {
+ description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
+ type = list(string)
+ default = [
+ "UnfulfillableCapacity",
+ "MaxSpotInstanceCountExceeded",
+ "TargetCapacityLimitExceededException",
+ "RequestLimitExceeded",
+ "ResourceLimitExceeded",
+ "MaxSpotInstanceCountExceeded",
+ "MaxSpotFleetRequestCountExceeded",
+ "InsufficientInstanceCapacity",
+ "InsufficientCapacityOnHost",
+ ]
+}
+
variable "enable_userdata" {
description = "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI."
type = bool