Add optional SSSD sidecar support to the soperator Terraform example
installation and to the slurm module.

What this patch does:
  * Adds three inputs at the installation level (slurm_sssd_enabled,
    slurm_sssd_conf_secret_ref_name, slurm_sssd_ldap_ca_config_map_ref_name)
    and forwards them to the slurm module as sssd_enabled,
    sssd_conf_secret_ref_name and sssd_ldap_ca_config_map_ref_name.
  * Adds local.resources.sssd (cpu 0.2, memory 0.5, ephemeral_storage 5).
    The templates render these as "cpu * 1000" millicores and as Gi.
  * Subtracts the sssd reservation from controller, login and worker
    resources only when sssd_enabled is true.
  * Renders an sssd section plus the two reference names into both Helm
    values templates.

Review notes:
  * Both templates now render the Secret / ConfigMap reference names with
    jsonencode(). The original patch used a raw interpolation wrapped in
    double quotes in terraform_fluxcd_values.yaml.tftpl, which is not
    escaped and disagreed with flux_release_nodesets.yaml.tftpl. The
    post-image blob hash on that file's "index" line is the one from the
    original patch and was not recomputed.
  * NOTE(review): the patch was received with its line breaks and
    indentation collapsed. Line structure was rebuilt from the hunk
    headers (all counts verified), but leading whitespace and "="
    alignment are reconstructed guesses, including the YAML nesting level
    of sssdConfSecretRefName / sssdLdapCAConfigMapRefName. Confirm against
    the target files, or apply with "git apply --ignore-whitespace".

diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf
index abf34d87..04d79c12 100644
--- a/soperator/installations/example/main.tf
+++ b/soperator/installations/example/main.tf
@@ -491,11 +491,14 @@ module "slurm" {
     }
   }]
 
-  login_allocation_id            = module.k8s.static_ip_allocation_id
-  login_public_ip                = var.slurm_login_public_ip
-  tailscale_enabled              = var.tailscale_enabled
-  login_sshd_config_map_ref_name = var.slurm_login_sshd_config_map_ref_name
-  login_ssh_root_public_keys     = var.slurm_login_ssh_root_public_keys
+  login_allocation_id              = module.k8s.static_ip_allocation_id
+  login_public_ip                  = var.slurm_login_public_ip
+  tailscale_enabled                = var.tailscale_enabled
+  login_sshd_config_map_ref_name   = var.slurm_login_sshd_config_map_ref_name
+  sssd_conf_secret_ref_name        = var.slurm_sssd_conf_secret_ref_name
+  sssd_ldap_ca_config_map_ref_name = var.slurm_sssd_ldap_ca_config_map_ref_name
+  sssd_enabled                     = var.slurm_sssd_enabled
+  login_ssh_root_public_keys       = var.slurm_login_ssh_root_public_keys
 
   flux_namespace = local.flux_namespace
 
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 4b893000..ac0854e3 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -418,6 +418,21 @@ slurm_login_public_ip = true
 # ---
 tailscale_enabled = false
 
+# Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes.
+# By default, false
+# ---
+slurm_sssd_enabled = false
+
+# Name of Secret containing sssd.conf for controller, login, and worker sssd containers.
+# By default, empty
+# ---
+slurm_sssd_conf_secret_ref_name = ""
+
+# Name of ConfigMap containing LDAP CA certificates for controller, login, and worker sssd containers.
+# By default, empty
+# ---
+slurm_sssd_ldap_ca_config_map_ref_name = ""
+
 # Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user.
 # ---
 slurm_login_ssh_root_public_keys = [
diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf
index 7d476c3c..9f6c42d8 100644
--- a/soperator/installations/example/variables.tf
+++ b/soperator/installations/example/variables.tf
@@ -984,6 +984,24 @@ variable "slurm_login_sshd_config_map_ref_name" {
   default     = ""
 }
 
+variable "slurm_sssd_conf_secret_ref_name" {
+  description = "Name of Secret containing sssd.conf propagated to controller, login, and worker sssd containers."
+  type        = string
+  default     = ""
+}
+
+variable "slurm_sssd_ldap_ca_config_map_ref_name" {
+  description = "Name of ConfigMap containing LDAP CA certificates propagated to controller, login, and worker sssd containers."
+  type        = string
+  default     = ""
+}
+
+variable "slurm_sssd_enabled" {
+  description = "Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes."
+  type        = bool
+  default     = false
+}
+
 variable "slurm_login_ssh_root_public_keys" {
   description = "Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user."
   type        = list(string)
diff --git a/soperator/modules/slurm/flux_release_nodesets.tf b/soperator/modules/slurm/flux_release_nodesets.tf
index d2b0369d..5fa497ff 100644
--- a/soperator/modules/slurm/flux_release_nodesets.tf
+++ b/soperator/modules/slurm/flux_release_nodesets.tf
@@ -8,11 +8,23 @@ resource "local_file" "flux_release_rendered_nodesets" {
     nodesets = var.worker_nodesets
 
     resources = [for res in var.resources.worker : {
-      cpu_cores                   = res.cpu_cores
-      memory_gibibytes            = res.memory_gibibytes
-      ephemeral_storage_gibibytes = res.ephemeral_storage_gibibytes
-      gpus                        = res.gpus
-      shared_memory               = var.shared_memory_size_gibibytes
+      cpu_cores = floor(
+        res.cpu_cores
+        -local.resources.munge.cpu
+        -(var.sssd_enabled ? local.resources.sssd.cpu : 0)
+      ) - local.resources.kruise_daemon.cpu
+      memory_gibibytes = floor(
+        res.memory_gibibytes
+        -local.resources.munge.memory
+        -(var.sssd_enabled ? local.resources.sssd.memory : 0)
+      ) - local.resources.kruise_daemon.memory
+      ephemeral_storage_gibibytes = floor(
+        res.ephemeral_storage_gibibytes
+        -local.resources.munge.ephemeral_storage
+        -(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
+      )
+      gpus          = res.gpus
+      shared_memory = var.shared_memory_size_gibibytes
     }]
 
     jail_submounts = {
@@ -47,6 +59,13 @@ resource "local_file" "flux_release_rendered_nodesets" {
       config_map_ref = var.worker_sshd_config_map_ref_name
     }
 
+    sssd = {
+      enabled                     = var.sssd_enabled
+      conf_secret_ref_name        = var.sssd_conf_secret_ref_name
+      ldap_ca_config_map_ref_name = var.sssd_ldap_ca_config_map_ref_name
+      resources                   = local.resources.sssd
+    }
+
     extra = local.slurm_node_extra
   })
 }
diff --git a/soperator/modules/slurm/locals.tf b/soperator/modules/slurm/locals.tf
index 88793253..5f113508 100644
--- a/soperator/modules/slurm/locals.tf
+++ b/soperator/modules/slurm/locals.tf
@@ -97,6 +97,11 @@ locals {
       memory            = 0.5
       ephemeral_storage = 5
     }
+    sssd = {
+      cpu               = 0.2
+      memory            = 0.5
+      ephemeral_storage = 5
+    }
     exporter = {
       cpu               = 0.25
       memory            = 0.25
diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf
index 4519e233..7bb000f4 100644
--- a/soperator/modules/slurm/main.tf
+++ b/soperator/modules/slurm/main.tf
@@ -178,19 +178,45 @@ resource "helm_release" "soperator_fluxcd_cm" {
         controller = {
           size = var.node_count.controller
           resources = {
-            cpu               = floor(var.resources.controller.cpu_cores - local.resources.munge.cpu - local.resources.kruise_daemon.cpu)
-            memory            = floor(var.resources.controller.memory_gibibytes - local.resources.munge.memory - local.resources.kruise_daemon.memory)
-            ephemeral_storage = floor(var.resources.controller.ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
+            cpu = floor(
+              var.resources.controller.cpu_cores
+              -local.resources.munge.cpu
+              -(var.sssd_enabled ? local.resources.sssd.cpu : 0)
+              -local.resources.kruise_daemon.cpu
+            )
+            memory = floor(
+              var.resources.controller.memory_gibibytes
+              -local.resources.munge.memory
+              -(var.sssd_enabled ? local.resources.sssd.memory : 0)
+              -local.resources.kruise_daemon.memory
+            )
+            ephemeral_storage = floor(
+              var.resources.controller.ephemeral_storage_gibibytes
+              -local.resources.munge.ephemeral_storage
+              -(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
+            )
           }
         }
 
         worker = {
           size = 0
           resources = {
-            cpu               = floor(var.resources.worker[0].cpu_cores - local.resources.munge.cpu) - local.resources.kruise_daemon.cpu
-            memory            = floor(var.resources.worker[0].memory_gibibytes - local.resources.munge.memory) - local.resources.kruise_daemon.memory
-            ephemeral_storage = floor(var.resources.worker[0].ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
-            gpus              = var.resources.worker[0].gpus
+            cpu = floor(
+              var.resources.worker[0].cpu_cores
+              -local.resources.munge.cpu
+              -(var.sssd_enabled ? local.resources.sssd.cpu : 0)
+            ) - local.resources.kruise_daemon.cpu
+            memory = floor(
+              var.resources.worker[0].memory_gibibytes
+              -local.resources.munge.memory
+              -(var.sssd_enabled ? local.resources.sssd.memory : 0)
+            ) - local.resources.kruise_daemon.memory
+            ephemeral_storage = floor(
+              var.resources.worker[0].ephemeral_storage_gibibytes
+              -local.resources.munge.ephemeral_storage
+              -(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
+            )
+            gpus = var.resources.worker[0].gpus
           }
           shared_memory    = var.shared_memory_size_gibibytes
           slurm_node_extra = local.slurm_node_extra
@@ -204,9 +230,23 @@ resource "helm_release" "soperator_fluxcd_cm" {
           root_public_keys = var.login_ssh_root_public_keys
           public_ip        = var.login_public_ip
           resources = {
-            cpu               = floor(var.resources.login.cpu_cores - local.resources.munge.cpu - local.resources.kruise_daemon.cpu)
-            memory            = floor(var.resources.login.memory_gibibytes - local.resources.munge.memory - local.resources.kruise_daemon.memory)
-            ephemeral_storage = floor(var.resources.login.ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
+            cpu = floor(
+              var.resources.login.cpu_cores
+              -local.resources.munge.cpu
+              -(var.sssd_enabled ? local.resources.sssd.cpu : 0)
+              -local.resources.kruise_daemon.cpu
+            )
+            memory = floor(
+              var.resources.login.memory_gibibytes
+              -local.resources.munge.memory
+              -(var.sssd_enabled ? local.resources.sssd.memory : 0)
+              -local.resources.kruise_daemon.memory
+            )
+            ephemeral_storage = floor(
+              var.resources.login.ephemeral_storage_gibibytes
+              -local.resources.munge.ephemeral_storage
+              -(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
+            )
           }
         }
 
@@ -219,6 +259,13 @@ resource "helm_release" "soperator_fluxcd_cm" {
           resources = local.resources.munge
         }
 
+        sssd = {
+          enabled                     = var.sssd_enabled
+          conf_secret_ref_name        = var.sssd_conf_secret_ref_name
+          ldap_ca_config_map_ref_name = var.sssd_ldap_ca_config_map_ref_name
+          resources                   = local.resources.sssd
+        }
+
         rest = {
           enabled   = var.rest_enabled
           resources = local.resources.rest
diff --git a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl
index 13695f4d..7a4e1453 100644
--- a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl
+++ b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl
@@ -204,6 +204,16 @@ nodesets:
       security:
         appArmorProfile: unconfined
 
+    sssd:
+      enabled: ${sssd.enabled}
+      resources:
+        cpu: ${sssd.resources.cpu * 1000}m
+        memory: ${sssd.resources.memory}Gi
+        ephemeralStorage: ${sssd.resources.ephemeral_storage}Gi
+
+    sssdConfSecretRefName: ${jsonencode(sssd.conf_secret_ref_name)}
+    sssdLdapCAConfigMapRefName: ${jsonencode(sssd.ldap_ca_config_map_ref_name)}
+
     configMapRefSupervisord: custom-supervisord-config
     configMapRefSshd: ${sshd.config_map_ref}
 
diff --git a/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl
index 8e695050..ae11bec8 100644
--- a/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl
+++ b/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl
@@ -449,6 +449,15 @@ resources:
         k8sNodeFilterName: system
 
     slurmNodes:
+      sssd:
+        enabled: ${slurm_cluster.nodes.sssd.enabled}
+        resources:
+          cpu: ${slurm_cluster.nodes.sssd.resources.cpu * 1000}m
+          memory: ${slurm_cluster.nodes.sssd.resources.memory}Gi
+          ephemeralStorage: ${slurm_cluster.nodes.sssd.resources.ephemeral_storage}Gi
+      sssdConfSecretRefName: ${jsonencode(slurm_cluster.nodes.sssd.conf_secret_ref_name)}
+      sssdLdapCAConfigMapRefName: ${jsonencode(slurm_cluster.nodes.sssd.ldap_ca_config_map_ref_name)}
+
       accounting:
         enabled: ${accounting_enabled}
         k8sNodeFilterName: ${slurm_cluster.k8s_node_filters.accounting.name}
diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf
index ba770a24..4846912f 100644
--- a/soperator/modules/slurm/variables.tf
+++ b/soperator/modules/slurm/variables.tf
@@ -178,6 +178,24 @@ variable "login_sshd_config_map_ref_name" {
   default     = ""
 }
 
+variable "sssd_conf_secret_ref_name" {
+  description = "Name of Secret containing sssd.conf propagated to controller, login, and worker sssd containers."
+  type        = string
+  default     = ""
+}
+
+variable "sssd_ldap_ca_config_map_ref_name" {
+  description = "Name of ConfigMap containing LDAP CA certificates propagated to controller, login, and worker sssd containers."
+  type        = string
+  default     = ""
+}
+
+variable "sssd_enabled" {
+  description = "Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes."
+  type        = bool
+  default     = false
+}
+
 variable "login_ssh_root_public_keys" {
   description = "Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user."
   type        = list(string)