From 1f832173996a6a8ed0442390fc1ca4f5c807adbf Mon Sep 17 00:00:00 2001 From: Saurabh Patel Date: Tue, 17 Mar 2026 16:02:27 -0500 Subject: [PATCH] Added "cpu" active_checks_scope option for CPU-only checks. --- soperator/installations/example/terraform.tfvars | 1 + soperator/installations/example/variables.tf | 4 ++-- soperator/modules/slurm/locals_active_checks.tf | 14 ++++++++++++++ soperator/modules/slurm/variables.tf | 4 ++-- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 4b8930005..e5ff5bb50 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -450,6 +450,7 @@ slurm_exporter_enabled = true # - "testing" - to be used for Soperator E2E tests. # - "dev" - to be used for Soperator development clusters. # - "essential" - skip most of checks and run only essential ones. Don't use in production. +# - "cpu" - run only CPU-related checks during bootstrap (`ssh-check` and `mem-perf`). # --- active_checks_scope = "" diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf index 7d476c3cf..49f2979e1 100644 --- a/soperator/installations/example/variables.tf +++ b/soperator/installations/example/variables.tf @@ -1208,8 +1208,8 @@ variable "active_checks_scope" { description = "Scope of active checks. Defines what active checks should be checked during cluster bootstrap." default = "" validation { - condition = contains(["dev", "testing", "prod_quick", "prod_acceptance", "essential"], var.active_checks_scope) - error_message = "active_checks_scope should be one of: dev, testing, prod_quick, prod_acceptance, essential." + condition = contains(["dev", "testing", "prod_quick", "prod_acceptance", "essential", "cpu"], var.active_checks_scope) + error_message = "active_checks_scope should be one of: dev, testing, prod_quick, prod_acceptance, essential, cpu." 
} } diff --git a/soperator/modules/slurm/locals_active_checks.tf b/soperator/modules/slurm/locals_active_checks.tf index e5d43e4f1..594ba2c51 100644 --- a/soperator/modules/slurm/locals_active_checks.tf +++ b/soperator/modules/slurm/locals_active_checks.tf @@ -125,6 +125,20 @@ locals { drainReasonPrefix = null } } + # Run only CPU-related validation checks + cpu = { + mem-perf = {} + ssh-check = { + k8sJobSpec = { + jobContainer = { + env = [{ + name : "NUM_OF_LOGIN_NODES", + value : tostring(var.node_count.login) + }] + } + } + } + } } soperator_activechecks_override_yaml = yamlencode(local.active_checks_scopes[var.active_checks_scope]) diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index ba770a246..b6dd24bd9 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -734,8 +734,8 @@ variable "active_checks_scope" { description = "Scope of active health-checks. Defines what checks should run after the cluster is provisioned." default = "" validation { - condition = contains(["dev", "testing", "prod_quick", "prod_acceptance", "essential"], var.active_checks_scope) - error_message = "active_checks_scope should be one of: dev, testing, prod_quick, prod_acceptance, essential." + condition = contains(["dev", "testing", "prod_quick", "prod_acceptance", "essential", "cpu"], var.active_checks_scope) + error_message = "active_checks_scope should be one of: dev, testing, prod_quick, prod_acceptance, essential, cpu." } } # endregion ActiveChecks