diff --git a/README.md b/README.md index 1ca2e82e22..7de101b517 100644 --- a/README.md +++ b/README.md @@ -215,6 +215,7 @@ Join our discord community via [this invite link](https://discord.gg/bxgXW8jJGh) | [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no | | [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.

`schedule_expression`: is used to configure the schedule for the lambda.
`enabled`: enable or disable the lambda trigger via the EventBridge.
`lambda_memory_size`: lambda memory size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
})
|
{
"config": {}
}
| no | | [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | +| [scale\_errors](#input\_scale\_errors) | List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts | `list(string)` |
[
"UnfulfillableCapacity",
"MaxSpotInstanceCountExceeded",
"TargetCapacityLimitExceededException",
"RequestLimitExceeded",
"ResourceLimitExceeded",
"MaxSpotInstanceCountExceeded",
"MaxSpotFleetRequestCountExceeded",
"InsufficientInstanceCapacity",
"InsufficientCapacityOnHost"
]
| no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. |
object({
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
use_prefix = optional(bool, true)
})
| `{}` | no | | [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no | diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index 72ff9e3e1a..3975d093f5 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -44,4 +44,5 @@ export interface RunnerInputParameters { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + scaleErrors: string[]; } diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index c4fd922fd0..c3cc6d8487 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -429,6 +429,7 @@ describe('create runner with errors', () => { allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED, capacityType: 'spot', type: 'Repo', + scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded'], }; const defaultExpectedFleetRequestValues: ExpectedFleetRequestValues = { type: 'Repo', @@ -699,6 +700,7 @@ interface RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + scaleErrors: string[]; } function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters { @@ -718,6 +720,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters { amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName, tracingEnabled: runnerConfig.tracingEnabled, onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError, + scaleErrors: runnerConfig.scaleErrors, }; } diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index d95dc99fa4..0f4fc5bee9 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -196,18 +196,7 @@ async function processFleetResult( return instances; } - // Educated guess of errors that would make sense to retry based on the list - // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html - const scaleErrors = [ - 'UnfulfillableCapacity', - 'MaxSpotInstanceCountExceeded', - 'TargetCapacityLimitExceededException', - 'RequestLimitExceeded', - 'ResourceLimitExceeded', - 'MaxSpotInstanceCountExceeded', - 'MaxSpotFleetRequestCountExceeded', - 'InsufficientInstanceCapacity', - ]; + const scaleErrors = runnerParameters.scaleErrors; const failedCount = countScaleErrors(errors, scaleErrors); if (failedCount > 0) { diff --git a/lambdas/functions/control-plane/src/modules.d.ts b/lambdas/functions/control-plane/src/modules.d.ts index 7570f29035..ff447c0e51 100644 --- a/lambdas/functions/control-plane/src/modules.d.ts +++ b/lambdas/functions/control-plane/src/modules.d.ts @@ -3,6 +3,7 @@ declare namespace NodeJS { AWS_REGION: string; ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string; ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string; + SCALE_ERRORS: string; ENVIRONMENT: string; GHES_URL: string; JOB_RETRY_CONFIG: string; diff --git a/lambdas/functions/control-plane/src/pool/pool.test.ts b/lambdas/functions/control-plane/src/pool/pool.test.ts index c05a8b8cb7..aaa6aea715 100644 --- a/lambdas/functions/control-plane/src/pool/pool.test.ts +++ b/lambdas/functions/control-plane/src/pool/pool.test.ts @@ -140,6 +140,8 @@ beforeEach(() => { process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot'; process.env.RUNNER_OWNER = ORG; process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString(); + process.env.SCALE_ERRORS = + '["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]'; const mockTokenReturnValue = { data: { diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts index aa690e97f6..4da3e7355d 100644 --- a/lambdas/functions/control-plane/src/pool/pool.ts +++ b/lambdas/functions/control-plane/src/pool/pool.ts @@ -41,6 +41,7 @@ export async function adjust(event: PoolEvent): Promise { const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string]) : []; + const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string]; const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl(); @@ -95,6 +96,7 @@ export async function adjust(event: PoolEvent): Promise { amiIdSsmParameterName, tracingEnabled, onDemandFailoverOnError, + scaleErrors, }, topUp, githubInstallationClient, diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index b876d31d50..315bb8ab0b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -105,6 +105,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = { subnets: ['subnet-123'], tracingEnabled: false, onDemandFailoverOnError: [], + scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'], }; let expectedRunnerParams: RunnerInputParameters; @@ -122,6 +123,8 @@ function setDefaults() { process.env.INSTANCE_TYPES = 'm5.large'; process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot'; process.env.ENABLE_ON_DEMAND_FAILOVER = undefined; + process.env.SCALE_ERRORS = + '["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]'; } beforeEach(() => { @@ -809,6 +812,16 @@ describe('scaleUp with public GH', () => { }); }); + it('creates a runner with correct config and labels and custom scale errors enabled.', async () => { + process.env.RUNNER_LABELS = 'label1,label2'; + process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']); + await scaleUpModule.scaleUp(TEST_DATA); + expect(createRunner).toBeCalledWith({ + ...expectedRunnerParams, + scaleErrors: ['RequestLimitExceeded'], + }); + }); + it('creates a runner and ensure the group argument is ignored', async () => { process.env.RUNNER_LABELS = 'label1,label2'; process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED'; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 35df7ea5d7..e271b901df 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -62,6 +62,7 @@ interface CreateEC2RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + scaleErrors: string[]; } function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) { @@ -255,6 +256,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | | [matcher\_config\_parameter\_store\_tier](#input\_matcher\_config\_parameter\_store\_tier) | The tier of the parameter store for the matcher configuration. Valid values are `Standard`, and `Advanced`. | `string` | `"Standard"` | no | | [metrics](#input\_metrics) | Configuration for metrics created by the module, by default metrics are disabled to avoid additional costs. When metrics are enable all metrics are created unless explicit configured otherwise. |
object({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_github_app_rate_limit = optional(bool, true)
enable_job_retry = optional(bool, true)
enable_spot_termination_warning = optional(bool, true)
}), {})
})
| `{}` | no | -| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {
runner\_config: {
runner\_os: "The EC2 Operating System type to use for action runner instances (linux,windows)."
runner\_architecture: "The platform architecture of the runner instance\_type."
runner\_metadata\_options: "(Optional) Metadata options for the ec2 runner instances."
ami: "(Optional) AMI configuration for the action runner instances. This object allows you to specify all AMI-related settings in one place."
create\_service\_linked\_role\_spot: (Optional) create the serviced linked role for spot instances that is required by the scale-up lambda.
credit\_specification: "(Optional) The credit specification of the runner instance\_type. Can be unset, `standard` or `unlimited`.
delay\_webhook\_event: "The number of seconds the event accepted by the webhook is invisible on the queue before the scale up lambda will receive the event."
disable\_runner\_autoupdate: "Disable the auto update of the github runner agent. Be aware there is a grace period of 30 days, see also the [GitHub article](https://github.blog/changelog/2022-02-01-github-actions-self-hosted-runners-can-now-disable-automatic-updates/)"
ebs\_optimized: "The EC2 EBS optimized configuration."
enable\_ephemeral\_runners: "Enable ephemeral runners, runners will only be used once."
enable\_job\_queued\_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
enable\_on\_demand\_failover\_for\_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
enable\_organization\_runners: "Register runners to organization, instead of repo level"
enable\_runner\_binaries\_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
enable\_ssm\_on\_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."
enable\_userdata: "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI."
instance\_allocation\_strategy: "The allocation strategy for spot instances. AWS recommends to use `capacity-optimized` however the AWS default is `lowest-price`."
instance\_max\_spot\_price: "Max price price for spot instances per hour. This variable will be passed to the create fleet as max spot price for the fleet."
instance\_target\_capacity\_type: "Default lifecycle used for runner instances, can be either `spot` or `on-demand`."
instance\_types: "List of instance types for the action runner. Defaults are based on runner\_os (al2023 for linux and Windows Server Core for win)."
job\_queue\_retention\_in\_seconds: "The number of seconds the job is held in the queue before it is purged"
minimum\_running\_time\_in\_minutes: "The time an ec2 action runner should be running at minimum before terminated if not busy."
pool\_runner\_owner: "The pool will deploy runners to the GitHub org ID, set this value to the org to which you want the runners deployed. Repo level is not supported."
runner\_additional\_security\_group\_ids: "List of additional security groups IDs to apply to the runner. If added outside the multi\_runner\_config block, the additional security group(s) will be applied to all runner configs. If added inside the multi\_runner\_config, the additional security group(s) will be applied to the individual runner."
runner\_as\_root: "Run the action runner under the root user. Variable `runner_run_as` will be ignored."
runner\_boot\_time\_in\_minutes: "The minimum time for an EC2 runner to boot and register as a runner."
runner\_disable\_default\_labels: "Disable default labels for the runners (os, architecture and `self-hosted`). If enabled, the runner will only have the extra labels provided in `runner_extra_labels`. In case you on own start script is used, this configuration parameter needs to be parsed via SSM."
runner\_extra\_labels: "Extra (custom) labels for the runners (GitHub). Separate each label by a comma. Labels checks on the webhook can be enforced by setting `multi_runner_config.matcherConfig.exactMatch`. GitHub read-only labels should not be provided."
runner\_group\_name: "Name of the runner group."
runner\_name\_prefix: "Prefix for the GitHub runner name."
runner\_run\_as: "Run the GitHub actions agent as user."
runners\_maximum\_count: "The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check."
scale\_down\_schedule\_expression: "Scheduler expression to check every x for scale down."
scale\_up\_reserved\_concurrent\_executions: "Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations."
userdata\_template: "Alternative user-data template, replacing the default template. By providing your own user\_data you have to take care of installing all required software, including the action runner. Variables userdata\_pre/post\_install are ignored."
enable\_jit\_config "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is available. In case you are upgrading from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI."
enable\_runner\_detailed\_monitoring: "Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details."
enable\_cloudwatch\_agent: "Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`."
cloudwatch\_config: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
userdata\_pre\_install: "Script to be ran before the GitHub Actions runner is installed on the EC2 instances"
userdata\_post\_install: "Script to be ran after the GitHub Actions runner is installed on the EC2 instances"
runner\_hook\_job\_started: "Script to be ran in the runner environment at the beginning of every job"
runner\_hook\_job\_completed: "Script to be ran in the runner environment at the end of every job"
runner\_ec2\_tags: "Map of tags that will be added to the launch template instance tag specifications."
runner\_iam\_role\_managed\_policy\_arns: "Attach AWS or customer-managed IAM policies (by ARN) to the runner IAM role"
vpc\_id: "The VPC for security groups of the action runners. If not set uses the value of `var.vpc_id`."
subnet\_ids: "List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. If not set, uses the value of `var.subnet_ids`."
idle\_config: "List of time period that can be defined as cron expression to keep a minimum amount of runners active instead of scaling down to 0. By defining this list you can ensure that in time periods that match the cron expression within 5 seconds a runner is kept idle."
runner\_log\_files: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
block\_device\_mappings: "The EC2 instance block device configuration. Takes the following keys: `device_name`, `delete_on_termination`, `volume_type`, `volume_size`, `encrypted`, `iops`, `throughput`, `kms_key_id`, `snapshot_id`."
job\_retry: "Experimental! Can be removed / changed without trigger a major release. Configure job retries. The configuration enables job retries (for ephemeral runners). After creating the instances a message will be published to a job retry queue. The job retry check lambda is checking after a delay if the job is queued. If not the message will be published again on the scale-up (build queue). Using this feature can impact the rate limit of the GitHub app."
pool\_config: "The configuration for updating the pool. The `pool_size` to adjust to by the events triggered by the `schedule_expression`. For example you can configure a cron expression for week days to adjust the pool to 10 and another expression for the weekend to adjust the pool to 1. Use `schedule_expression_timezone` to override the schedule time zone (defaults to UTC)."
}
matcherConfig: {
labelMatchers: "The list of list of labels supported by the runner configuration. `[[self-hosted, linux, x64, example]]`"
exactMatch: "If set to true all labels in the workflow job must match the GitHub labels (os, architecture and `self-hosted`). When false if __any__ workflow label matches it will trigger the webhook."
priority: "If set it defines the priority of the matcher, the matcher with the lowest priority will be evaluated first. Default is 999, allowed values 0-999."
}
redrive\_build\_queue: "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries."
} |
map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null)
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
placement = optional(object({
affinity = optional(string)
availability_zone = optional(string)
group_id = optional(string)
group_name = optional(string)
host_id = optional(string)
host_resource_group_arn = optional(number)
spread_domain = optional(string)
tenancy = optional(string)
partition_number = optional(number)
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
}))
| n/a | yes | +| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {
runner\_config: {
runner\_os: "The EC2 Operating System type to use for action runner instances (linux,windows)."
runner\_architecture: "The platform architecture of the runner instance\_type."
runner\_metadata\_options: "(Optional) Metadata options for the ec2 runner instances."
ami: "(Optional) AMI configuration for the action runner instances. This object allows you to specify all AMI-related settings in one place."
create\_service\_linked\_role\_spot: (Optional) create the serviced linked role for spot instances that is required by the scale-up lambda.
credit\_specification: "(Optional) The credit specification of the runner instance\_type. Can be unset, `standard` or `unlimited`.
delay\_webhook\_event: "The number of seconds the event accepted by the webhook is invisible on the queue before the scale up lambda will receive the event."
disable\_runner\_autoupdate: "Disable the auto update of the github runner agent. Be aware there is a grace period of 30 days, see also the [GitHub article](https://github.blog/changelog/2022-02-01-github-actions-self-hosted-runners-can-now-disable-automatic-updates/)"
ebs\_optimized: "The EC2 EBS optimized configuration."
enable\_ephemeral\_runners: "Enable ephemeral runners, runners will only be used once."
enable\_job\_queued\_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
enable\_on\_demand\_failover\_for\_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
scale\_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
enable\_organization\_runners: "Register runners to organization, instead of repo level"
enable\_runner\_binaries\_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
enable\_ssm\_on\_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."
enable\_userdata: "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI."
instance\_allocation\_strategy: "The allocation strategy for spot instances. AWS recommends to use `capacity-optimized` however the AWS default is `lowest-price`."
instance\_max\_spot\_price: "Max price price for spot instances per hour. This variable will be passed to the create fleet as max spot price for the fleet."
instance\_target\_capacity\_type: "Default lifecycle used for runner instances, can be either `spot` or `on-demand`."
instance\_types: "List of instance types for the action runner. Defaults are based on runner\_os (al2023 for linux and Windows Server Core for win)."
job\_queue\_retention\_in\_seconds: "The number of seconds the job is held in the queue before it is purged"
minimum\_running\_time\_in\_minutes: "The time an ec2 action runner should be running at minimum before terminated if not busy."
pool\_runner\_owner: "The pool will deploy runners to the GitHub org ID, set this value to the org to which you want the runners deployed. Repo level is not supported."
runner\_additional\_security\_group\_ids: "List of additional security groups IDs to apply to the runner. If added outside the multi\_runner\_config block, the additional security group(s) will be applied to all runner configs. If added inside the multi\_runner\_config, the additional security group(s) will be applied to the individual runner."
runner\_as\_root: "Run the action runner under the root user. Variable `runner_run_as` will be ignored."
runner\_boot\_time\_in\_minutes: "The minimum time for an EC2 runner to boot and register as a runner."
runner\_disable\_default\_labels: "Disable default labels for the runners (os, architecture and `self-hosted`). If enabled, the runner will only have the extra labels provided in `runner_extra_labels`. In case you on own start script is used, this configuration parameter needs to be parsed via SSM."
runner\_extra\_labels: "Extra (custom) labels for the runners (GitHub). Separate each label by a comma. Labels checks on the webhook can be enforced by setting `multi_runner_config.matcherConfig.exactMatch`. GitHub read-only labels should not be provided."
runner\_group\_name: "Name of the runner group."
runner\_name\_prefix: "Prefix for the GitHub runner name."
runner\_run\_as: "Run the GitHub actions agent as user."
runners\_maximum\_count: "The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check."
scale\_down\_schedule\_expression: "Scheduler expression to check every x for scale down."
scale\_up\_reserved\_concurrent\_executions: "Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations."
userdata\_template: "Alternative user-data template, replacing the default template. By providing your own user\_data you have to take care of installing all required software, including the action runner. Variables userdata\_pre/post\_install are ignored."
enable\_jit\_config "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is available. In case you are upgrading from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI."
enable\_runner\_detailed\_monitoring: "Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details."
enable\_cloudwatch\_agent: "Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`."
cloudwatch\_config: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
userdata\_pre\_install: "Script to be ran before the GitHub Actions runner is installed on the EC2 instances"
userdata\_post\_install: "Script to be ran after the GitHub Actions runner is installed on the EC2 instances"
runner\_hook\_job\_started: "Script to be ran in the runner environment at the beginning of every job"
runner\_hook\_job\_completed: "Script to be ran in the runner environment at the end of every job"
runner\_ec2\_tags: "Map of tags that will be added to the launch template instance tag specifications."
runner\_iam\_role\_managed\_policy\_arns: "Attach AWS or customer-managed IAM policies (by ARN) to the runner IAM role"
vpc\_id: "The VPC for security groups of the action runners. If not set uses the value of `var.vpc_id`."
subnet\_ids: "List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. If not set, uses the value of `var.subnet_ids`."
idle\_config: "List of time period that can be defined as cron expression to keep a minimum amount of runners active instead of scaling down to 0. By defining this list you can ensure that in time periods that match the cron expression within 5 seconds a runner is kept idle."
runner\_log\_files: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
block\_device\_mappings: "The EC2 instance block device configuration. Takes the following keys: `device_name`, `delete_on_termination`, `volume_type`, `volume_size`, `encrypted`, `iops`, `throughput`, `kms_key_id`, `snapshot_id`."
job\_retry: "Experimental! Can be removed / changed without trigger a major release. Configure job retries. The configuration enables job retries (for ephemeral runners). After creating the instances a message will be published to a job retry queue. The job retry check lambda is checking after a delay if the job is queued. If not the message will be published again on the scale-up (build queue). Using this feature can impact the rate limit of the GitHub app."
pool\_config: "The configuration for updating the pool. The `pool_size` to adjust to by the events triggered by the `schedule_expression`. For example you can configure a cron expression for week days to adjust the pool to 10 and another expression for the weekend to adjust the pool to 1. Use `schedule_expression_timezone` to override the schedule time zone (defaults to UTC)."
}
matcherConfig: {
labelMatchers: "The list of list of labels supported by the runner configuration. `[[self-hosted, linux, x64, example]]`"
exactMatch: "If set to true all labels in the workflow job must match the GitHub labels (os, architecture and `self-hosted`). When false if __any__ workflow label matches it will trigger the webhook."
priority: "If set it defines the priority of the matcher, the matcher with the lowest priority will be evaluated first. Default is 999, allowed values 0-999."
}
redrive\_build\_queue: "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries."
} |
map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null)
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
scale_errors = optional(list(string), [
"UnfulfillableCapacity",
"MaxSpotInstanceCountExceeded",
"TargetCapacityLimitExceededException",
"RequestLimitExceeded",
"ResourceLimitExceeded",
"MaxSpotInstanceCountExceeded",
"MaxSpotFleetRequestCountExceeded",
"InsufficientInstanceCapacity",
"InsufficientCapacityOnHost",
])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
placement = optional(object({
affinity = optional(string)
availability_zone = optional(string)
group_id = optional(string)
group_name = optional(string)
host_id = optional(string)
host_resource_group_arn = optional(number)
spread_domain = optional(string)
tenancy = optional(string)
partition_number = optional(number)
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
}))
| n/a | yes | | [pool\_lambda\_reserved\_concurrent\_executions](#input\_pool\_lambda\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [pool\_lambda\_timeout](#input\_pool\_lambda\_timeout) | Time out for the pool lambda in seconds. | `number` | `60` | no | | [prefix](#input\_prefix) | The prefix used for naming resources | `string` | `"github-actions"` | no | diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf index ea8fb17dce..e8454b5b69 100644 --- a/modules/multi-runner/runners.tf +++ b/modules/multi-runner/runners.tf @@ -32,6 +32,7 @@ module "runners" { github_app_parameters = local.github_app_parameters ebs_optimized = each.value.runner_config.ebs_optimized enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors + scale_errors = each.value.runner_config.scale_errors enable_organization_runners = each.value.runner_config.enable_organization_runners enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners enable_jit_config = each.value.runner_config.enable_jit_config diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf index 0ca473ecf2..99c9a1ba00 100644 --- a/modules/multi-runner/variables.tf +++ b/modules/multi-runner/variables.tf @@ -71,14 +71,25 @@ variable "multi_runner_config" { id_ssm_parameter_arn = optional(string, null) kms_key_arn = optional(string, null) }), null) - create_service_linked_role_spot = optional(bool, false) - credit_specification = optional(string, null) - delay_webhook_event = optional(number, 30) - disable_runner_autoupdate = optional(bool, false) - ebs_optimized = optional(bool, false) - enable_ephemeral_runners = optional(bool, false) - enable_job_queued_check = optional(bool, null) - enable_on_demand_failover_for_errors = optional(list(string), []) + create_service_linked_role_spot = optional(bool, false) + credit_specification = optional(string, null) + delay_webhook_event = optional(number, 30) + disable_runner_autoupdate = optional(bool, false) + ebs_optimized = optional(bool, false) + enable_ephemeral_runners = optional(bool, false) + enable_job_queued_check = optional(bool, null) + enable_on_demand_failover_for_errors = optional(list(string), []) + scale_errors = optional(list(string), [ + "UnfulfillableCapacity", + "MaxSpotInstanceCountExceeded", + "TargetCapacityLimitExceededException", + "RequestLimitExceeded", + "ResourceLimitExceeded", + "MaxSpotInstanceCountExceeded", + "MaxSpotFleetRequestCountExceeded", + "InsufficientInstanceCapacity", + "InsufficientCapacityOnHost", + ]) enable_organization_runners = optional(bool, false) enable_runner_binaries_syncer = optional(bool, true) enable_ssm_on_runners = optional(bool, false) @@ -197,6 +208,7 @@ variable "multi_runner_config" { enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once." enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners." enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later." + scale_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" enable_organization_runners: "Register runners to organization, instead of repo level" enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI." enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances." diff --git a/modules/runners/README.md b/modules/runners/README.md index 9bb3a6f4e6..5955110a20 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -221,6 +221,7 @@ yarn run dist | [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check. | `number` | `3` | no | | [s3\_runner\_binaries](#input\_s3\_runner\_binaries) | Bucket details for cached GitHub binary. |
object({
arn = string
id = string
key = string
})
| n/a | yes | | [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | +| [scale\_errors](#input\_scale\_errors) | List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts | `list(string)` |
[
"UnfulfillableCapacity",
"MaxSpotInstanceCountExceeded",
"TargetCapacityLimitExceededException",
"RequestLimitExceeded",
"ResourceLimitExceeded",
"MaxSpotInstanceCountExceeded",
"MaxSpotFleetRequestCountExceeded",
"InsufficientInstanceCapacity",
"InsufficientCapacityOnHost"
]
| no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to consume accepted build events. |
object({
arn = string
url = string
})
| n/a | yes | | [ssm\_housekeeper](#input\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.

`schedule_expression`: is used to configure the schedule for the lambda.
`state`: state of the cloudwatch event rule. Valid values are `DISABLED`, `ENABLED`, and `ENABLED_WITH_ALL_CLOUDTRAIL_MANAGEMENT_EVENTS`.
`lambda_memory_size`: lambda memory size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
state = optional(string, "ENABLED")
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
})
|
{
"config": {}
}
| no | diff --git a/modules/runners/pool.tf b/modules/runners/pool.tf index 2019ebbc6f..7d03d1558b 100644 --- a/modules/runners/pool.tf +++ b/modules/runners/pool.tf @@ -42,6 +42,7 @@ module "pool" { ephemeral = var.enable_ephemeral_runners enable_jit_config = var.enable_jit_config enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors + scale_errors = var.scale_errors boot_time_in_minutes = var.runner_boot_time_in_minutes labels = var.runner_labels launch_template = aws_launch_template.runner diff --git a/modules/runners/pool/README.md b/modules/runners/pool/README.md index 4bd268c37c..2578bb1001 100644 --- a/modules/runners/pool/README.md +++ b/modules/runners/pool/README.md @@ -49,7 +49,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [aws\_partition](#input\_aws\_partition) | (optional) partition for the arn if not 'aws' | `string` | `"aws"` | no | -| [config](#input\_config) | Lookup details in parent module. |
object({
lambda = object({
log_level = string
logging_retention_in_days = number
logging_kms_key_id = string
reserved_concurrent_executions = number
s3_bucket = string
s3_key = string
s3_object_version = string
security_group_ids = list(string)
runtime = string
architecture = string
memory_size = number
timeout = number
zip = string
subnet_ids = list(string)
})
tags = map(string)
ghes = object({
url = string
ssl_verify = string
})
github_app_parameters = object({
key_base64 = map(string)
id = map(string)
})
subnet_ids = list(string)
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
name = string
})
group_name = string
name_prefix = string
pool_owner = string
role = object({
arn = string
})
})
instance_types = list(string)
instance_target_capacity_type = string
instance_allocation_strategy = string
instance_max_spot_price = string
prefix = string
pool = list(object({
schedule_expression = string
schedule_expression_timezone = string
size = number
}))
role_permissions_boundary = string
kms_key_arn = string
ami_kms_key_arn = string
ami_id_ssm_parameter_arn = string
role_path = string
ssm_token_path = string
ssm_config_path = string
ami_id_ssm_parameter_name = string
ami_id_ssm_parameter_read_policy_arn = string
arn_ssm_parameters_path_config = string
lambda_tags = map(string)
user_agent = string
})
| n/a | yes | +| [config](#input\_config) | Lookup details in parent module. |
object({
lambda = object({
log_level = string
logging_retention_in_days = number
logging_kms_key_id = string
reserved_concurrent_executions = number
s3_bucket = string
s3_key = string
s3_object_version = string
security_group_ids = list(string)
runtime = string
architecture = string
memory_size = number
timeout = number
zip = string
subnet_ids = list(string)
})
tags = map(string)
ghes = object({
url = string
ssl_verify = string
})
github_app_parameters = object({
key_base64 = map(string)
id = map(string)
})
subnet_ids = list(string)
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
scale_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
name = string
})
group_name = string
name_prefix = string
pool_owner = string
role = object({
arn = string
})
})
instance_types = list(string)
instance_target_capacity_type = string
instance_allocation_strategy = string
instance_max_spot_price = string
prefix = string
pool = list(object({
schedule_expression = string
schedule_expression_timezone = string
size = number
}))
role_permissions_boundary = string
kms_key_arn = string
ami_kms_key_arn = string
ami_id_ssm_parameter_arn = string
role_path = string
ssm_token_path = string
ssm_config_path = string
ami_id_ssm_parameter_name = string
ami_id_ssm_parameter_read_policy_arn = string
arn_ssm_parameters_path_config = string
lambda_tags = map(string)
user_agent = string
})
| n/a | yes | | [tracing\_config](#input\_tracing\_config) | Configuration for lambda tracing. |
object({
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
})
| `{}` | no | ## Outputs diff --git a/modules/runners/pool/main.tf b/modules/runners/pool/main.tf index e141b22d25..30f39a262a 100644 --- a/modules/runners/pool/main.tf +++ b/modules/runners/pool/main.tf @@ -47,6 +47,7 @@ resource "aws_lambda_function" "pool" { POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.config.runner.enable_on_demand_failover_for_errors) + SCALE_ERRORS = jsonencode(var.config.runner.scale_errors) } } diff --git a/modules/runners/pool/variables.tf b/modules/runners/pool/variables.tf index f1e841cde6..f1536d11c8 100644 --- a/modules/runners/pool/variables.tf +++ b/modules/runners/pool/variables.tf @@ -32,6 +32,7 @@ variable "config" { ephemeral = bool enable_jit_config = bool enable_on_demand_failover_for_errors = list(string) + scale_errors = list(string) boot_time_in_minutes = number labels = list(string) launch_template = object({ diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf index b97fefed4f..bb3fc83e52 100644 --- a/modules/runners/scale-up.tf +++ b/modules/runners/scale-up.tf @@ -59,6 +59,7 @@ resource "aws_lambda_function" "scale_up" { SSM_CONFIG_PATH = "${var.ssm_paths.root}/${var.ssm_paths.config}" SUBNET_IDS = join(",", var.subnet_ids) ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.enable_on_demand_failover_for_errors) + SCALE_ERRORS = jsonencode(var.scale_errors) JOB_RETRY_CONFIG = jsonencode(local.job_retry_config) } } diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index 8c8a1a136c..e40967fb89 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -701,6 +701,22 @@ variable "enable_on_demand_failover_for_errors" { default = [] } +variable "scale_errors" { + description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" + type = list(string) + default = [ + "UnfulfillableCapacity", + "MaxSpotInstanceCountExceeded", + "TargetCapacityLimitExceededException", + "RequestLimitExceeded", + "ResourceLimitExceeded", + "MaxSpotInstanceCountExceeded", + "MaxSpotFleetRequestCountExceeded", + "InsufficientInstanceCapacity", + "InsufficientCapacityOnHost", + ] +} + variable "lambda_tags" { description = "Map of tags that will be added to all the lambda function resources. Note these are additional tags to the default tags." type = map(string) diff --git a/variables.tf b/variables.tf index bc9d3abac5..a65464afb3 100644 --- a/variables.tf +++ b/variables.tf @@ -283,6 +283,22 @@ variable "enable_runner_on_demand_failover_for_errors" { default = [] } +variable "scale_errors" { + description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" + type = list(string) + default = [ + "UnfulfillableCapacity", + "MaxSpotInstanceCountExceeded", + "TargetCapacityLimitExceededException", + "RequestLimitExceeded", + "ResourceLimitExceeded", + "MaxSpotInstanceCountExceeded", + "MaxSpotFleetRequestCountExceeded", + "InsufficientInstanceCapacity", + "InsufficientCapacityOnHost", + ] +} + variable "enable_userdata" { description = "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI." type = bool