Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -491,11 +491,14 @@ module "slurm" {
}
}]

login_allocation_id = module.k8s.static_ip_allocation_id
login_public_ip = var.slurm_login_public_ip
tailscale_enabled = var.tailscale_enabled
login_sshd_config_map_ref_name = var.slurm_login_sshd_config_map_ref_name
login_ssh_root_public_keys = var.slurm_login_ssh_root_public_keys
login_allocation_id = module.k8s.static_ip_allocation_id
login_public_ip = var.slurm_login_public_ip
tailscale_enabled = var.tailscale_enabled
login_sshd_config_map_ref_name = var.slurm_login_sshd_config_map_ref_name
sssd_conf_secret_ref_name = var.slurm_sssd_conf_secret_ref_name
sssd_ldap_ca_config_map_ref_name = var.slurm_sssd_ldap_ca_config_map_ref_name
sssd_enabled = var.slurm_sssd_enabled
login_ssh_root_public_keys = var.slurm_login_ssh_root_public_keys

flux_namespace = local.flux_namespace

Expand Down
15 changes: 15 additions & 0 deletions soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,21 @@ slurm_login_public_ip = true
# ---
tailscale_enabled = false

# Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes.
# When enabled, the sidecar's resource reservation (local.resources.sssd in modules/slurm)
# is subtracted from the controller, login, and worker resources.
# By default, false
# ---
slurm_sssd_enabled = false

# Name of the Secret containing sssd.conf for controller, login, and worker sssd containers.
# Rendered into the generated values as `sssdConfSecretRefName`.
# NOTE(review): presumably must be set when slurm_sssd_enabled = true - confirm.
# By default, empty
# ---
slurm_sssd_conf_secret_ref_name = ""

# Name of the ConfigMap containing LDAP CA certificates for controller, login, and worker sssd containers.
# Rendered into the generated values as `sssdLdapCAConfigMapRefName`.
# By default, empty
# ---
slurm_sssd_ldap_ca_config_map_ref_name = ""

# Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user.
# ---
slurm_login_ssh_root_public_keys = [
Expand Down
18 changes: 18 additions & 0 deletions soperator/installations/example/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,24 @@ variable "slurm_login_sshd_config_map_ref_name" {
default = ""
}

# SSSD: name of the Secret that holds sssd.conf.
# Forwarded to the slurm module as `sssd_conf_secret_ref_name`.
variable "slurm_sssd_conf_secret_ref_name" {
  type        = string
  default     = ""
  description = "Name of Secret containing sssd.conf propagated to controller, login, and worker sssd containers."
}

# SSSD: name of the ConfigMap that holds the LDAP CA certificates.
# Forwarded to the slurm module as `sssd_ldap_ca_config_map_ref_name`.
variable "slurm_sssd_ldap_ca_config_map_ref_name" {
  type        = string
  default     = ""
  description = "Name of ConfigMap containing LDAP CA certificates propagated to controller, login, and worker sssd containers."
}

# SSSD: master switch for the sidecar; off unless explicitly enabled.
# Forwarded to the slurm module as `sssd_enabled`.
variable "slurm_sssd_enabled" {
  type        = bool
  default     = false
  description = "Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes."
}

variable "slurm_login_ssh_root_public_keys" {
description = "Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user."
type = list(string)
Expand Down
29 changes: 24 additions & 5 deletions soperator/modules/slurm/flux_release_nodesets.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,23 @@ resource "local_file" "flux_release_rendered_nodesets" {

nodesets = var.worker_nodesets
resources = [for res in var.resources.worker : {
cpu_cores = res.cpu_cores
memory_gibibytes = res.memory_gibibytes
ephemeral_storage_gibibytes = res.ephemeral_storage_gibibytes
gpus = res.gpus
shared_memory = var.shared_memory_size_gibibytes
# Resources left for each worker nodeset entry after sidecar reservations are subtracted:
# munge always, sssd only when var.sssd_enabled is true, and kruise_daemon for CPU and memory.
# NOTE(review): kruise_daemon is subtracted outside floor() here, whereas the controller and
# login blocks in main.tf subtract it inside floor() - confirm the difference is intentional.
cpu_cores = floor(
res.cpu_cores
-local.resources.munge.cpu
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
) - local.resources.kruise_daemon.cpu
memory_gibibytes = floor(
res.memory_gibibytes
-local.resources.munge.memory
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
) - local.resources.kruise_daemon.memory
# Ephemeral storage has no kruise_daemon term; only munge and (optionally) sssd are subtracted.
ephemeral_storage_gibibytes = floor(
res.ephemeral_storage_gibibytes
-local.resources.munge.ephemeral_storage
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
)
# GPUs and shared memory are passed through without adjustment.
gpus = res.gpus
shared_memory = var.shared_memory_size_gibibytes
}]

jail_submounts = {
Expand Down Expand Up @@ -47,6 +59,13 @@ resource "local_file" "flux_release_rendered_nodesets" {
config_map_ref = var.worker_sshd_config_map_ref_name
}

# SSSD sidecar settings for the nodesets values template, which reads them as
# sssd.enabled, sssd.resources.*, sssd.conf_secret_ref_name and sssd.ldap_ca_config_map_ref_name.
# The resource reservation is passed regardless of whether the sidecar is enabled.
sssd = {
enabled = var.sssd_enabled
conf_secret_ref_name = var.sssd_conf_secret_ref_name
ldap_ca_config_map_ref_name = var.sssd_ldap_ca_config_map_ref_name
resources = local.resources.sssd
}

extra = local.slurm_node_extra
})
}
5 changes: 5 additions & 0 deletions soperator/modules/slurm/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ locals {
memory = 0.5
ephemeral_storage = 5
}
# Resource reservation for the SSSD sidecar container.
# cpu is expressed in cores (the values templates render it as cpu * 1000 with an "m" suffix);
# memory and ephemeral_storage are rendered with a "Gi" suffix.
# These amounts are subtracted from controller, login, and worker resources when var.sssd_enabled is true.
sssd = {
cpu = 0.2
memory = 0.5
ephemeral_storage = 5
}
exporter = {
cpu = 0.25
memory = 0.25
Expand Down
67 changes: 57 additions & 10 deletions soperator/modules/slurm/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -178,19 +178,45 @@ resource "helm_release" "soperator_fluxcd_cm" {
controller = {
size = var.node_count.controller
resources = {
cpu = floor(var.resources.controller.cpu_cores - local.resources.munge.cpu - local.resources.kruise_daemon.cpu)
memory = floor(var.resources.controller.memory_gibibytes - local.resources.munge.memory - local.resources.kruise_daemon.memory)
ephemeral_storage = floor(var.resources.controller.ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
# CPU and memory for the controller: configured capacity minus the munge, sssd (only when
# var.sssd_enabled is true) and kruise_daemon reservations, rounded down by floor().
cpu = floor(
var.resources.controller.cpu_cores
-local.resources.munge.cpu
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
-local.resources.kruise_daemon.cpu
)
memory = floor(
var.resources.controller.memory_gibibytes
-local.resources.munge.memory
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
-local.resources.kruise_daemon.memory
)
# Ephemeral storage: only munge and (optionally) sssd are subtracted; there is no kruise_daemon term.
ephemeral_storage = floor(
var.resources.controller.ephemeral_storage_gibibytes
-local.resources.munge.ephemeral_storage
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
)
}
}

worker = {
size = 0
resources = {
cpu = floor(var.resources.worker[0].cpu_cores - local.resources.munge.cpu) - local.resources.kruise_daemon.cpu
memory = floor(var.resources.worker[0].memory_gibibytes - local.resources.munge.memory) - local.resources.kruise_daemon.memory
ephemeral_storage = floor(var.resources.worker[0].ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
gpus = var.resources.worker[0].gpus
# Worker resources are derived from the first entry of var.resources.worker only.
# munge is always subtracted, sssd only when var.sssd_enabled is true.
# NOTE(review): kruise_daemon is subtracted after floor() here, while the controller and login
# blocks subtract it inside floor() - confirm the difference is intentional.
cpu = floor(
var.resources.worker[0].cpu_cores
-local.resources.munge.cpu
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
) - local.resources.kruise_daemon.cpu
memory = floor(
var.resources.worker[0].memory_gibibytes
-local.resources.munge.memory
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
) - local.resources.kruise_daemon.memory
# Ephemeral storage has no kruise_daemon term.
ephemeral_storage = floor(
var.resources.worker[0].ephemeral_storage_gibibytes
-local.resources.munge.ephemeral_storage
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
)
# GPU count is passed through unchanged.
gpus = var.resources.worker[0].gpus
}
shared_memory = var.shared_memory_size_gibibytes
slurm_node_extra = local.slurm_node_extra
Expand All @@ -204,9 +230,23 @@ resource "helm_release" "soperator_fluxcd_cm" {
root_public_keys = var.login_ssh_root_public_keys
public_ip = var.login_public_ip
resources = {
cpu = floor(var.resources.login.cpu_cores - local.resources.munge.cpu - local.resources.kruise_daemon.cpu)
memory = floor(var.resources.login.memory_gibibytes - local.resources.munge.memory - local.resources.kruise_daemon.memory)
ephemeral_storage = floor(var.resources.login.ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
# CPU and memory for login nodes: configured capacity minus the munge, sssd (only when
# var.sssd_enabled is true) and kruise_daemon reservations, rounded down by floor().
cpu = floor(
var.resources.login.cpu_cores
-local.resources.munge.cpu
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
-local.resources.kruise_daemon.cpu
)
memory = floor(
var.resources.login.memory_gibibytes
-local.resources.munge.memory
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
-local.resources.kruise_daemon.memory
)
# Ephemeral storage: only munge and (optionally) sssd are subtracted; there is no kruise_daemon term.
ephemeral_storage = floor(
var.resources.login.ephemeral_storage_gibibytes
-local.resources.munge.ephemeral_storage
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
)
}
}

Expand All @@ -219,6 +259,13 @@ resource "helm_release" "soperator_fluxcd_cm" {
resources = local.resources.munge
}

# SSSD sidecar settings, read by the values template as slurm_cluster.nodes.sssd.*:
# the enable flag, the Secret (sssd.conf) and ConfigMap (LDAP CA certificates) names,
# and the sidecar's resource reservation from local.resources.sssd.
sssd = {
enabled = var.sssd_enabled
conf_secret_ref_name = var.sssd_conf_secret_ref_name
ldap_ca_config_map_ref_name = var.sssd_ldap_ca_config_map_ref_name
resources = local.resources.sssd
}

rest = {
enabled = var.rest_enabled
resources = local.resources.rest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,16 @@ nodesets:
security:
appArmorProfile: unconfined

# SSSD sidecar settings. cpu is supplied in cores and converted to millicores here;
# memory and ephemeral storage are emitted with a Gi suffix.
sssd:
enabled: ${sssd.enabled}
resources:
cpu: ${sssd.resources.cpu * 1000}m
memory: ${sssd.resources.memory}Gi
ephemeralStorage: ${sssd.resources.ephemeral_storage}Gi

# Secret (sssd.conf) and ConfigMap (LDAP CA certificates) names.
# jsonencode emits them as quoted strings, so an empty name renders as "".
sssdConfSecretRefName: ${jsonencode(sssd.conf_secret_ref_name)}
sssdLdapCAConfigMapRefName: ${jsonencode(sssd.ldap_ca_config_map_ref_name)}

configMapRefSupervisord: custom-supervisord-config
configMapRefSshd: ${sshd.config_map_ref}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,15 @@ resources:
k8sNodeFilterName: system

slurmNodes:
# SSSD sidecar settings. cpu is supplied in cores and rendered as millicores;
# memory and ephemeral storage are emitted with a Gi suffix.
sssd:
enabled: ${slurm_cluster.nodes.sssd.enabled}
resources:
cpu: ${slurm_cluster.nodes.sssd.resources.cpu * 1000}m
memory: ${slurm_cluster.nodes.sssd.resources.memory}Gi
ephemeralStorage: ${slurm_cluster.nodes.sssd.resources.ephemeral_storage}Gi
# jsonencode emits a correctly quoted and escaped string (an empty name becomes ""),
# matching how the nodesets values template renders the same two fields.
sssdConfSecretRefName: ${jsonencode(slurm_cluster.nodes.sssd.conf_secret_ref_name)}
sssdLdapCAConfigMapRefName: ${jsonencode(slurm_cluster.nodes.sssd.ldap_ca_config_map_ref_name)}

accounting:
enabled: ${accounting_enabled}
k8sNodeFilterName: ${slurm_cluster.k8s_node_filters.accounting.name}
Expand Down
18 changes: 18 additions & 0 deletions soperator/modules/slurm/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,24 @@ variable "login_sshd_config_map_ref_name" {
default = ""
}

# SSSD: name of the Secret that holds sssd.conf.
# Exposed to the values templates as `sssd.conf_secret_ref_name`.
variable "sssd_conf_secret_ref_name" {
  type        = string
  default     = ""
  description = "Name of Secret containing sssd.conf propagated to controller, login, and worker sssd containers."
}

# SSSD: name of the ConfigMap that holds the LDAP CA certificates.
# Exposed to the values templates as `sssd.ldap_ca_config_map_ref_name`.
variable "sssd_ldap_ca_config_map_ref_name" {
  type        = string
  default     = ""
  description = "Name of ConfigMap containing LDAP CA certificates propagated to controller, login, and worker sssd containers."
}

# SSSD: master switch for the sidecar; off unless explicitly enabled.
# Also gates whether local.resources.sssd is subtracted from node resources.
variable "sssd_enabled" {
  type        = bool
  default     = false
  description = "Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes."
}

variable "login_ssh_root_public_keys" {
description = "Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user."
type = list(string)
Expand Down
Loading