Commit b82ffea

Upstream eks/actions-runner-controller with var.auto_update_enabled (cloudposse/terraform-aws-components#1095)
Parent: f68021e

File tree

5 files changed: +49 −25 lines

src/README.md

Lines changed: 8 additions & 10 deletions

```diff
@@ -384,15 +384,13 @@ scale down to zero before finishing all the jobs, leaving some waiting indefinit
 the `max_duration` to a time long enough to cover the full time a job may have to wait between the time it is queued and
 the time it finishes, assuming that the HRA scales up the pool by 1 and runs the job on the new runner.
 
-:::info
-
-If there are more jobs queued than there are runners allowed by `maxReplicas`, the timeout timer does not start on the
-capacity reservation until enough reservations ahead of it are removed for it to be considered as representing and
-active job. Although there are some edge cases regarding `max_duration` that seem not to be covered properly (see
-[actions-runner-controller issue #2466](https://github.com/actions/actions-runner-controller/issues/2466)), they only
-merit adding a few extra minutes to the timeout.
-
-:::
+> [!TIP]
+>
+> If there are more jobs queued than there are runners allowed by `maxReplicas`, the timeout timer does not start on the
+> capacity reservation until enough reservations ahead of it are removed for it to be considered as representing and
+> active job. Although there are some edge cases regarding `max_duration` that seem not to be covered properly (see
+> [actions-runner-controller issue #2466](https://github.com/actions/actions-runner-controller/issues/2466)), they only
+> merit adding a few extra minutes to the timeout.
 
 ### Recommended `max_duration` Duration
 
@@ -570,7 +568,7 @@ documentation for further details.
 | <a name="input_regex_replace_chars"></a> [regex\_replace\_chars](#input\_regex\_replace\_chars) | Terraform regular expression (regex) string.<br>Characters matching the regex will be removed from the ID elements.<br>If not set, `"/[^a-zA-Z0-9-]/"` is used to remove all characters other than hyphens, letters and digits. | `string` | `null` | no |
 | <a name="input_region"></a> [region](#input\_region) | AWS Region. | `string` | n/a | yes |
 | <a name="input_resources"></a> [resources](#input\_resources) | The cpu and memory of the deployment's limits and requests. | <pre>object({<br> limits = object({<br> cpu = string<br> memory = string<br> })<br> requests = object({<br> cpu = string<br> memory = string<br> })<br> })</pre> | n/a | yes |
-| <a name="input_runners"></a> [runners](#input\_runners) | Map of Action Runner configurations, with the key being the name of the runner. Please note that the name must be in<br>kebab-case.<br><br>For example:<pre>hcl<br>organization_runner = {<br> type = "organization" # can be either 'organization' or 'repository'<br> dind_enabled: true # A Docker daemon will be started in the runner Pod<br> image: summerwind/actions-runner-dind # If dind_enabled=false, set this to 'summerwind/actions-runner'<br> scope = "ACME" # org name for Organization runners, repo name for Repository runners<br> group = "core-automation" # Optional. Assigns the runners to a runner group, for access control.<br> scale_down_delay_seconds = 300<br> min_replicas = 1<br> max_replicas = 5<br> labels = [<br> "Ubuntu",<br> "core-automation",<br> ]<br>}</pre> | <pre>map(object({<br> type = string<br> scope = string<br> group = optional(string, null)<br> image = optional(string, "summerwind/actions-runner-dind")<br> dind_enabled = optional(bool, true)<br> node_selector = optional(map(string), {})<br> pod_annotations = optional(map(string), {})<br><br> # running_pod_annotations are only applied to the pods once they start running a job<br> running_pod_annotations = optional(map(string), {})<br><br> # affinity is too complex to model. Whatever you assigned affinity will be copied<br> # to the runner Pod spec.<br> affinity = optional(any)<br><br> tolerations = optional(list(object({<br> key = string<br> operator = string<br> value = optional(string, null)<br> effect = string<br> })), [])<br> scale_down_delay_seconds = optional(number, 300)<br> min_replicas = number<br> max_replicas = number<br> # Scheduled overrides. See https://github.com/actions/actions-runner-controller/blob/master/docs/automatically-scaling-runners.md#scheduled-overrides<br> # Order is important. The earlier entry is prioritized higher than later entries. So you usually define<br> # one-time overrides at the top of your list, then yearly, monthly, weekly, and lastly daily overrides.<br> scheduled_overrides = optional(list(object({<br> start_time = string # ISO 8601 format, eg, "2021-06-01T00:00:00+09:00"<br> end_time = string # ISO 8601 format, eg, "2021-06-01T00:00:00+09:00"<br> min_replicas = optional(number)<br> max_replicas = optional(number)<br> recurrence_rule = optional(object({<br> frequency = string # One of Daily, Weekly, Monthly, Yearly<br> until_time = optional(string) # ISO 8601 format time after which the schedule will no longer apply<br> }))<br> })), [])<br> busy_metrics = optional(object({<br> scale_up_threshold = string<br> scale_down_threshold = string<br> scale_up_adjustment = optional(string)<br> scale_down_adjustment = optional(string)<br> scale_up_factor = optional(string)<br> scale_down_factor = optional(string)<br> }))<br> webhook_driven_scaling_enabled = optional(bool, true)<br> # max_duration is the duration after which a job will be considered completed,<br> # even if the webhook has not received a "job completed" event.<br> # This is to ensure that if an event is missed, it does not leave the runner running forever.<br> # Set it long enough to cover the longest job you expect to run and then some.<br> # See https://github.com/actions/actions-runner-controller/blob/9afd93065fa8b1f87296f0dcdf0c2753a0548cb7/docs/automatically-scaling-runners.md?plain=1#L264-L268<br> # Defaults to 1 hour programmatically (to be able to detect if both max_duration and webhook_startup_timeout are set).<br> max_duration = optional(string)<br> # The name `webhook_startup_timeout` was misleading and has been deprecated.<br> # It has been renamed `max_duration`.<br> webhook_startup_timeout = optional(string)<br> # Adjust the time (in seconds) to wait for the Docker in Docker daemon to become responsive.<br> wait_for_docker_seconds = optional(string, "")<br> pull_driven_scaling_enabled = optional(bool, false)<br> labels = optional(list(string), [])<br> # If not null, `docker_storage` specifies the size (as `go` string) of<br> # an ephemeral (default storage class) Persistent Volume to allocate for the Docker daemon.<br> # Takes precedence over `tmpfs_enabled` for the Docker daemon storage.<br> docker_storage = optional(string, null)<br> # storage is deprecated in favor of docker_storage, since it is only storage for the Docker daemon<br> storage = optional(string, null)<br> # If `pvc_enabled` is true, a Persistent Volume Claim will be created for the runner<br> # and mounted at /home/runner/work/shared. This is useful for sharing data between runners.<br> pvc_enabled = optional(bool, false)<br> # If `tmpfs_enabled` is `true`, both the runner and the docker daemon will use a tmpfs volume,<br> # meaning that all data will be stored in RAM rather than on disk, bypassing disk I/O limitations,<br> # but what would have been disk usage is now additional memory usage. You must specify memory<br> # requests and limits when using tmpfs or else the Pod will likely crash the Node.<br> tmpfs_enabled = optional(bool)<br> resources = optional(object({<br> limits = optional(object({<br> cpu = optional(string, "1")<br> memory = optional(string, "1Gi")<br> ephemeral_storage = optional(string, "10Gi")<br> }), {})<br> requests = optional(object({<br> cpu = optional(string, "500m")<br> memory = optional(string, "256Mi")<br> ephemeral_storage = optional(string, "1Gi")<br> }), {})<br> }), {})<br> }))</pre> | n/a | yes |
+| <a name="input_runners"></a> [runners](#input\_runners) | Map of Action Runner configurations, with the key being the name of the runner. Please note that the name must be in<br>kebab-case.<br><br>For example:<pre>hcl<br>organization_runner = {<br> type = "organization" # can be either 'organization' or 'repository'<br> dind_enabled: true # A Docker daemon will be started in the runner Pod<br> image: summerwind/actions-runner-dind # If dind_enabled=false, set this to 'summerwind/actions-runner'<br> scope = "ACME" # org name for Organization runners, repo name for Repository runners<br> group = "core-automation" # Optional. Assigns the runners to a runner group, for access control.<br> scale_down_delay_seconds = 300<br> min_replicas = 1<br> max_replicas = 5<br> labels = [<br> "Ubuntu",<br> "core-automation",<br> ]<br>}</pre> | <pre>map(object({<br> type = string<br> scope = string<br> group = optional(string, null)<br> image = optional(string, "summerwind/actions-runner-dind")<br> auto_update_enabled = optional(bool, true)<br> dind_enabled = optional(bool, true)<br> node_selector = optional(map(string), {})<br> pod_annotations = optional(map(string), {})<br><br> # running_pod_annotations are only applied to the pods once they start running a job<br> running_pod_annotations = optional(map(string), {})<br><br> # affinity is too complex to model. Whatever you assigned affinity will be copied<br> # to the runner Pod spec.<br> affinity = optional(any)<br><br> tolerations = optional(list(object({<br> key = string<br> operator = string<br> value = optional(string, null)<br> effect = string<br> })), [])<br> scale_down_delay_seconds = optional(number, 300)<br> min_replicas = number<br> max_replicas = number<br> # Scheduled overrides. See https://github.com/actions/actions-runner-controller/blob/master/docs/automatically-scaling-runners.md#scheduled-overrides<br> # Order is important. The earlier entry is prioritized higher than later entries. So you usually define<br> # one-time overrides at the top of your list, then yearly, monthly, weekly, and lastly daily overrides.<br> scheduled_overrides = optional(list(object({<br> start_time = string # ISO 8601 format, eg, "2021-06-01T00:00:00+09:00"<br> end_time = string # ISO 8601 format, eg, "2021-06-01T00:00:00+09:00"<br> min_replicas = optional(number)<br> max_replicas = optional(number)<br> recurrence_rule = optional(object({<br> frequency = string # One of Daily, Weekly, Monthly, Yearly<br> until_time = optional(string) # ISO 8601 format time after which the schedule will no longer apply<br> }))<br> })), [])<br> busy_metrics = optional(object({<br> scale_up_threshold = string<br> scale_down_threshold = string<br> scale_up_adjustment = optional(string)<br> scale_down_adjustment = optional(string)<br> scale_up_factor = optional(string)<br> scale_down_factor = optional(string)<br> }))<br> webhook_driven_scaling_enabled = optional(bool, true)<br> # max_duration is the duration after which a job will be considered completed,<br> # even if the webhook has not received a "job completed" event.<br> # This is to ensure that if an event is missed, it does not leave the runner running forever.<br> # Set it long enough to cover the longest job you expect to run and then some.<br> # See https://github.com/actions/actions-runner-controller/blob/9afd93065fa8b1f87296f0dcdf0c2753a0548cb7/docs/automatically-scaling-runners.md?plain=1#L264-L268<br> # Defaults to 1 hour programmatically (to be able to detect if both max_duration and webhook_startup_timeout are set).<br> max_duration = optional(string)<br> # The name `webhook_startup_timeout` was misleading and has been deprecated.<br> # It has been renamed `max_duration`.<br> webhook_startup_timeout = optional(string)<br> # Adjust the time (in seconds) to wait for the Docker in Docker daemon to become responsive.<br> wait_for_docker_seconds = optional(string, "")<br> pull_driven_scaling_enabled = optional(bool, false)<br> labels = optional(list(string), [])<br> # If not null, `docker_storage` specifies the size (as `go` string) of<br> # an ephemeral (default storage class) Persistent Volume to allocate for the Docker daemon.<br> # Takes precedence over `tmpfs_enabled` for the Docker daemon storage.<br> docker_storage = optional(string, null)<br> # storage is deprecated in favor of docker_storage, since it is only storage for the Docker daemon<br> storage = optional(string, null)<br> # If `pvc_enabled` is true, a Persistent Volume Claim will be created for the runner<br> # and mounted at /home/runner/work/shared. This is useful for sharing data between runners.<br> pvc_enabled = optional(bool, false)<br> # If `tmpfs_enabled` is `true`, both the runner and the docker daemon will use a tmpfs volume,<br> # meaning that all data will be stored in RAM rather than on disk, bypassing disk I/O limitations,<br> # but what would have been disk usage is now additional memory usage. You must specify memory<br> # requests and limits when using tmpfs or else the Pod will likely crash the Node.<br> tmpfs_enabled = optional(bool)<br> resources = optional(object({<br> limits = optional(object({<br> cpu = optional(string, "1")<br> memory = optional(string, "1Gi")<br> # ephemeral-storage is the Kubernetes name, but `ephemeral_storage` is the gomplate name,<br> # so allow either. If both are specified, `ephemeral-storage` takes precedence.<br> ephemeral-storage = optional(string)<br> ephemeral_storage = optional(string, "10Gi")<br> }), {})<br> requests = optional(object({<br> cpu = optional(string, "500m")<br> memory = optional(string, "256Mi")<br> # ephemeral-storage is the Kubernetes name, but `ephemeral_storage` is the gomplate name,<br> # so allow either. If both are specified, `ephemeral-storage` takes precedence.<br> ephemeral-storage = optional(string)<br> ephemeral_storage = optional(string, "1Gi")<br> }), {})<br> }), {})<br> }))</pre> | n/a | yes |
 | <a name="input_s3_bucket_arns"></a> [s3\_bucket\_arns](#input\_s3\_bucket\_arns) | List of ARNs of S3 Buckets to which the runners will have read-write access to. | `list(string)` | `[]` | no |
 | <a name="input_ssm_docker_config_json_path"></a> [ssm\_docker\_config\_json\_path](#input\_ssm\_docker\_config\_json\_path) | SSM path to the Docker config JSON | `string` | `null` | no |
 | <a name="input_ssm_github_secret_path"></a> [ssm\_github\_secret\_path](#input\_ssm\_github\_secret\_path) | The path in SSM to the GitHub app private key file contents or GitHub PAT token. | `string` | `""` | no |
```
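The README diff above documents the new `auto_update_enabled` option on each entry of the `runners` map, which defaults to `true`. A minimal sketch of a runner entry that disables auto-update to pin the runner software to the version baked into the image (the runner name and values here are illustrative, not taken from the commit):

```hcl
runners = {
  # Hypothetical runner entry; `auto_update_enabled` is the option added
  # in this commit, everything else is a pre-existing field.
  organization-runner = {
    type         = "organization"
    scope        = "ACME"
    min_replicas = 1
    max_replicas = 5
    # Defaults to true. Setting false renders DISABLE_RUNNER_UPDATE=true
    # in the runner Pod, so the runner keeps the version shipped in the image.
    auto_update_enabled = false
  }
}
```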

src/charts/actions-runner/Chart.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.3.1
+version: 0.3.2
 
 # This chart only deploys Resources for actions-runner-controller, so app version does not really apply.
 # We use Resource API version instead.
```

src/charts/actions-runner/templates/runnerdeployment.yaml

Lines changed: 21 additions & 3 deletions

```diff
@@ -139,12 +139,21 @@ spec:
       # to report its status and deregister itself from the runner pool.
       - name: RUNNER_GRACEFUL_STOP_TIMEOUT
         value: "80"
+      - name: DISABLE_RUNNER_UPDATE
+        value: "{{ printf "%v" (not .Values.auto_update_enabled) }}"
       {{- with .Values.wait_for_docker_seconds }}
       # If Docker is taking too long to start (which is likely due to some other performance issue),
       # increase the timeout from the default of 120 seconds.
       - name: WAIT_FOR_DOCKER_SECONDS
         value: "{{ . }}"
       {{- end }}
+      {{- if $use_tmpfs }}
+      - name: RUNNER_HOME
+        value: "/runner-tmpfs"
+      - name: RUNNER_WORKDIR
+        value: "/runner-tmpfs/_work"
+      {{- end }}
+
       # You could reserve nodes for runners by labeling and tainting nodes with
       # node-role.kubernetes.io/actions-runner
       # and then adding the following to this RunnerDeployment
@@ -206,6 +215,7 @@ spec:
       {{- end }}
       # dockerdWithinRunnerContainer = false means access to a Docker daemon is provided by a sidecar container.
       dockerdWithinRunnerContainer: {{ $use_dind_in_runner }}
+      dockerEnabled: {{ $use_dind }}
       image: {{ .Values.image | quote }}
       imagePullPolicy: IfNotPresent
       {{- if $use_dockerconfig }}
@@ -217,15 +227,23 @@ spec:
        limits:
          cpu: {{ .Values.resources.limits.cpu }}
          memory: {{ .Values.resources.limits.memory }}
+         {{- if index .Values.resources.limits "ephemeral-storage" }}
+         ephemeral-storage: {{ index .Values.resources.limits "ephemeral-storage" }}
+         {{- else }}
          {{- if index .Values.resources.limits "ephemeral_storage" }}
          ephemeral-storage: {{ .Values.resources.limits.ephemeral_storage }}
          {{- end }}
+         {{- end }}
        requests:
          cpu: {{ .Values.resources.requests.cpu }}
          memory: {{ .Values.resources.requests.memory }}
+         {{- if index .Values.resources.requests "ephemeral-storage" }}
+         ephemeral-storage: {{ index .Values.resources.requests "ephemeral-storage" }}
+         {{- else }}
          {{- if index .Values.resources.requests "ephemeral_storage" }}
          ephemeral-storage: {{ .Values.resources.requests.ephemeral_storage }}
          {{- end }}
+         {{- end }}
       {{- if and (not $use_dind_in_runner) (or .Values.docker_storage $use_tmpfs) }}
       {{- /* dockerVolumeMounts are mounted into the docker sidecar, and ignored if running with dockerdWithinRunnerContainer */}}
       dockerVolumeMounts:
@@ -251,14 +269,14 @@ spec:
       {{- if $use_tmpfs }}
       - mountPath: /tmp
         name: tmp
-      - mountPath: /runner/_work
-        name: work
+      - mountPath: /runner-tmpfs
+        name: runner-tmpfs
       {{- end }}
       {{- end }}{{/* End of volumeMounts */}}
       {{- if or (and $use_dind (or .Values.docker_storage $use_tmpfs)) $use_pvc $use_dockerconfig (not (empty .Values.running_pod_annotations)) }}
       volumes:
       {{- if $use_tmpfs }}
-      - name: work
+      - name: runner-tmpfs
       emptyDir:
         medium: Memory
       - name: tmp
```
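The template change above makes the chart accept both the Kubernetes spelling `ephemeral-storage` and the underscore spelling `ephemeral_storage` for resource limits and requests, preferring the hyphenated key when both are present. A values sketch illustrating the precedence (sizes are illustrative, not from the commit):

```yaml
# Hypothetical chart values fragment for a single runner.
resources:
  limits:
    cpu: "1"
    memory: "1Gi"
    ephemeral-storage: "20Gi" # Kubernetes name; wins when both keys are set
    ephemeral_storage: "10Gi" # underscore name; used only as a fallback
```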

src/main.tf

Lines changed: 1 addition & 0 deletions

```diff
@@ -225,6 +225,7 @@ module "actions_runner" {
   type                     = each.value.type
   scope                    = each.value.scope
   image                    = each.value.image
+  auto_update_enabled      = each.value.auto_update_enabled
   dind_enabled             = each.value.dind_enabled
   service_account_role_arn = module.actions_runner_controller.service_account_role_arn
   resources                = each.value.resources
```
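The `auto_update_enabled` value wired through here becomes the chart's `DISABLE_RUNNER_UPDATE` environment variable, inverted by `not` and stringified by `printf "%v"` (Go's default boolean formatting). With `auto_update_enabled = false`, the runner container env would render roughly as follows (a sketch, not captured output):

```yaml
env:
  - name: RUNNER_GRACEFUL_STOP_TIMEOUT
    value: "80"
  - name: DISABLE_RUNNER_UPDATE
    value: "true" # printf "%v" (not false)
```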
