Add an opt-in "pool sufficiency" metric to the scale-up lambda.

TypeScript (lambdas/functions/control-plane/src/scale-runners):
  * scale-up.ts gains createPoolSufficiencyMetric(). When ENABLE_METRIC_POOL_SUFFICIENCY
    is enabled (read with yn(), default false) it creates a single 'SufficientPoolHosts'
    Count metric with an Environment dimension and attaches Environment, RepositoryName
    and RepositoryOwner as metadata.
  * scaleUp() reports 0 right before it launches a new runner, and 1 when it logs that no
    runner will be created because the job is not queued.
  * scale-up.test.ts mocks createSingleMetric and covers four cases: flag enabled while
    scaling up (0), flag enabled with a job that is not queued (1), flag set to 'false',
    and flag unset.

Terraform:
  * New flag metrics.metric.enable_pool_sufficiency (optional bool, default true) in the
    root variables.tf, modules/runners, modules/multi-runner and modules/runners/job-retry.
  * ENABLE_METRIC_POOL_SUFFICIENCY is set to "metrics.enable && metric.enable_pool_sufficiency"
    in the scale-up lambda environment and in the job-retry environment_variables local.
  * The commented-out metric examples in examples/default and examples/multi-runner list
    the new flag.

Note: line breaks and indentation in this patch were reconstructed; whitespace in context
lines may not match the target files byte-for-byte.

diff --git a/examples/default/main.tf b/examples/default/main.tf
index 2addb372a4..509e6a315a 100644
--- a/examples/default/main.tf
+++ b/examples/default/main.tf
@@ -128,6 +128,7 @@ module "runners" {
   #   metric = {
   #     enable_spot_termination_warning = true
   #     enable_job_retry                = false
+  #     enable_pool_sufficiency         = true
   #     enable_github_app_rate_limit    = false
   #   }
   # }
diff --git a/examples/multi-runner/main.tf b/examples/multi-runner/main.tf
index acbcdb8081..6085e248d5 100644
--- a/examples/multi-runner/main.tf
+++ b/examples/multi-runner/main.tf
@@ -150,6 +150,7 @@ module "runners" {
   #   metric = {
   #     enable_github_app_rate_limit    = true
   #     enable_job_retry                = false
+  #     enable_pool_sufficiency         = true
   #     enable_spot_termination_warning = true
   #   }
   # }
diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts
index 14c0a0422e..2005390666 100644
--- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts
+++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts
@@ -12,6 +12,7 @@ import { RunnerInputParameters } from './../aws/runners.d';
 import ScaleError from './ScaleError';
 import * as scaleUpModule from './scale-up';
 import { getParameter } from '@aws-github-runner/aws-ssm-util';
+import { createSingleMetric } from '@aws-github-runner/aws-powertools-util';
 import { describe, it, expect, beforeEach, vi } from 'vitest';
 
 const mockOctokit = {
@@ -33,6 +34,7 @@ const mockCreateRunner = vi.mocked(createRunner);
 const mockListRunners = vi.mocked(listEC2Runners);
 const mockSSMClient = mockClient(SSMClient);
 const mockSSMgetParameter = vi.mocked(getParameter);
+const mockCreateSingleMetric = vi.mocked(createSingleMetric);
 
 vi.mock('@octokit/rest', () => ({
   Octokit: vi.fn().mockImplementation(() => mockOctokit),
@@ -61,6 +63,22 @@ vi.mock('@aws-github-runner/aws-ssm-util', async () => {
   };
 });
 
+vi.mock('@aws-github-runner/aws-powertools-util', async () => {
+  const actual = (await vi.importActual(
+    '@aws-github-runner/aws-powertools-util',
+  )) as typeof import('@aws-github-runner/aws-powertools-util');
+
+  return {
+    ...actual,
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    createSingleMetric: vi.fn((name: string, unit: string, value: number, dimensions?: Record<string, string>) => {
+      return {
+        addMetadata: vi.fn(),
+      };
+    }),
+  };
+});
+
 export type RunnerType = 'ephemeral' | 'non-ephemeral';
 
 // for ephemeral and non-ephemeral runners
@@ -183,6 +201,83 @@ describe('scaleUp with GHES', () => {
     expect(listEC2Runners).not.toBeCalled();
   });
 
+  describe('pool sufficiency metrics', () => {
+    beforeEach(() => {
+      process.env.ENABLE_ORGANIZATION_RUNNERS = 'true';
+      process.env.ENVIRONMENT = 'test-env';
+    });
+
+    it('records pool sufficiency metric as insufficient when scaling up', async () => {
+      process.env.ENABLE_METRIC_POOL_SUFFICIENCY = 'true';
+      process.env.RUNNERS_MAXIMUM_COUNT = '5';
+
+      mockListRunners.mockImplementation(async () => [
+        {
+          instanceId: 'i-1234',
+          launchTime: new Date(),
+          type: 'Org',
+          owner: TEST_DATA.repositoryOwner,
+        },
+      ]);
+
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+
+      expect(mockCreateSingleMetric).toHaveBeenCalledWith('SufficientPoolHosts', 'Count', 0.0, {
+        Environment: 'test-env',
+      });
+    });
+
+    it('records pool sufficiency metric as sufficient when job is not queued', async () => {
+      process.env.ENABLE_METRIC_POOL_SUFFICIENCY = 'true';
+
+      mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({
+        data: { status: 'completed' },
+      }));
+
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+
+      expect(mockCreateSingleMetric).toHaveBeenCalledWith('SufficientPoolHosts', 'Count', 1.0, {
+        Environment: 'test-env',
+      });
+    });
+
+    it('does not record pool sufficiency metric when disabled', async () => {
+      process.env.ENABLE_METRIC_POOL_SUFFICIENCY = 'false';
+      process.env.RUNNERS_MAXIMUM_COUNT = '5';
+
+      mockListRunners.mockImplementation(async () => [
+        {
+          instanceId: 'i-1234',
+          launchTime: new Date(),
+          type: 'Org',
+          owner: TEST_DATA.repositoryOwner,
+        },
+      ]);
+
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+
+      expect(mockCreateSingleMetric).not.toHaveBeenCalled();
+    });
+
+    it('does not record pool sufficiency metric when environment variable is undefined', async () => {
+      delete process.env.ENABLE_METRIC_POOL_SUFFICIENCY;
+      process.env.RUNNERS_MAXIMUM_COUNT = '5';
+
+      mockListRunners.mockImplementation(async () => [
+        {
+          instanceId: 'i-1234',
+          launchTime: new Date(),
+          type: 'Org',
+          owner: TEST_DATA.repositoryOwner,
+        },
+      ]);
+
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+
+      expect(mockCreateSingleMetric).not.toHaveBeenCalled();
+    });
+  });
+
   describe('on org level', () => {
     beforeEach(() => {
       process.env.ENABLE_ORGANIZATION_RUNNERS = 'true';
diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts
index 638edd3232..3dccea4311 100644
--- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts
+++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts
@@ -1,5 +1,9 @@
 import { Octokit } from '@octokit/rest';
-import { addPersistentContextToChildLogger, createChildLogger } from '@aws-github-runner/aws-powertools-util';
+import {
+  addPersistentContextToChildLogger,
+  createChildLogger,
+  createSingleMetric,
+} from '@aws-github-runner/aws-powertools-util';
 import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util';
 import yn from 'yn';
 
@@ -9,6 +13,7 @@ import { RunnerInputParameters } from './../aws/runners.d';
 import ScaleError from './ScaleError';
 import { publishRetryMessage } from './job-retry';
 import { metricGitHubAppRateLimit } from '../github/rate-limit';
+import { MetricUnit } from '@aws-lambda-powertools/metrics';
 
 const logger = createChildLogger('scale-up');
 
@@ -307,6 +312,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
 
     if (scaleUp) {
       logger.info(`Attempting to launch a new runner`);
+      createPoolSufficiencyMetric(environment, payload, false);
 
       await createRunners(
         {
@@ -348,6 +354,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
     }
   } else {
     logger.info('No runner will be created, job is not queued.');
+    createPoolSufficiencyMetric(environment, payload, true);
   }
 }
 
@@ -473,3 +480,14 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins
     }
   }
 }
+
+function createPoolSufficiencyMetric(environment: string, payload: ActionRequestMessage, wasSufficient: boolean) {
+  if (yn(process.env.ENABLE_METRIC_POOL_SUFFICIENCY, { default: false })) {
+    const metric = createSingleMetric('SufficientPoolHosts', MetricUnit.Count, wasSufficient ? 1.0 : 0.0, {
+      Environment: environment,
+    });
+    metric.addMetadata('Environment', environment);
+    metric.addMetadata('RepositoryName', payload.repositoryName);
+    metric.addMetadata('RepositoryOwner', payload.repositoryOwner);
+  }
+}
diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf
index edbdb33059..d2570b3d1c 100644
--- a/modules/multi-runner/variables.tf
+++ b/modules/multi-runner/variables.tf
@@ -697,6 +697,7 @@ variable "metrics" {
     metric = optional(object({
       enable_github_app_rate_limit    = optional(bool, true)
       enable_job_retry                = optional(bool, true)
+      enable_pool_sufficiency         = optional(bool, true)
       enable_spot_termination_warning = optional(bool, true)
     }), {})
   })
diff --git a/modules/runners/job-retry/main.tf b/modules/runners/job-retry/main.tf
index 9561c7db71..ea5dfd7924 100644
--- a/modules/runners/job-retry/main.tf
+++ b/modules/runners/job-retry/main.tf
@@ -5,6 +5,7 @@ locals {
   environment_variables = {
     ENABLE_ORGANIZATION_RUNNERS         = var.config.enable_organization_runners
     ENABLE_METRIC_JOB_RETRY             = var.config.metrics.enable && var.config.metrics.metric.enable_job_retry
+    ENABLE_METRIC_POOL_SUFFICIENCY      = var.config.metrics.enable && var.config.metrics.metric.enable_pool_sufficiency
     ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = var.config.metrics.enable && var.config.metrics.metric.enable_github_app_rate_limit
     GHES_URL                            = var.config.ghes_url
     USER_AGENT                          = var.config.user_agent
diff --git a/modules/runners/job-retry/variables.tf b/modules/runners/job-retry/variables.tf
index 4a8fe19fbf..5b26336ba1 100644
--- a/modules/runners/job-retry/variables.tf
+++ b/modules/runners/job-retry/variables.tf
@@ -57,6 +57,7 @@ variable "config" {
       metric = optional(object({
         enable_github_app_rate_limit = optional(bool, true)
         enable_job_retry             = optional(bool, true)
+        enable_pool_sufficiency      = optional(bool, true)
       }), {})
     }), {})
     prefix = optional(string, null)
diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf
index 9230267c07..35e7d15a20 100644
--- a/modules/runners/scale-up.tf
+++ b/modules/runners/scale-up.tf
@@ -31,6 +31,7 @@ resource "aws_lambda_function" "scale_up" {
       ENABLE_JIT_CONFIG                   = var.enable_jit_config
       ENABLE_JOB_QUEUED_CHECK             = local.enable_job_queued_check
       ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = var.metrics.enable && var.metrics.metric.enable_github_app_rate_limit
+      ENABLE_METRIC_POOL_SUFFICIENCY      = var.metrics.enable && var.metrics.metric.enable_pool_sufficiency
       ENABLE_ORGANIZATION_RUNNERS         = var.enable_organization_runners
       ENVIRONMENT                         = var.prefix
       GHES_URL                            = var.ghes_url
diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf
index a78231e7da..b3a88b74ea 100644
--- a/modules/runners/variables.tf
+++ b/modules/runners/variables.tf
@@ -727,6 +727,7 @@ variable "metrics" {
     metric = optional(object({
       enable_github_app_rate_limit    = optional(bool, true)
       enable_job_retry                = optional(bool, true)
+      enable_pool_sufficiency         = optional(bool, true)
       enable_spot_termination_warning = optional(bool, true)
     }), {})
   })
diff --git a/variables.tf b/variables.tf
index f412d2a486..39a9682828 100644
--- a/variables.tf
+++ b/variables.tf
@@ -926,6 +926,7 @@ variable "metrics" {
     metric = optional(object({
       enable_github_app_rate_limit    = optional(bool, true)
       enable_job_retry                = optional(bool, true)
+      enable_pool_sufficiency         = optional(bool, true)
       enable_spot_termination_warning = optional(bool, true)
     }), {})
   })