Skip to content

Commit 39bd07c

Browse files
authored
Merge pull request #885 from nebius/SCHED-1024
SCHED-1024 Soperator add SSSD support for LDAP integration
2 parents f587d2a + fdd3b2d commit 39bd07c

File tree

9 files changed

+164
-20
lines changed

9 files changed

+164
-20
lines changed

soperator/installations/example/main.tf

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -491,11 +491,14 @@ module "slurm" {
491491
}
492492
}]
493493

494-
login_allocation_id = module.k8s.static_ip_allocation_id
495-
login_public_ip = var.slurm_login_public_ip
496-
tailscale_enabled = var.tailscale_enabled
497-
login_sshd_config_map_ref_name = var.slurm_login_sshd_config_map_ref_name
498-
login_ssh_root_public_keys = var.slurm_login_ssh_root_public_keys
494+
login_allocation_id = module.k8s.static_ip_allocation_id
495+
login_public_ip = var.slurm_login_public_ip
496+
tailscale_enabled = var.tailscale_enabled
497+
login_sshd_config_map_ref_name = var.slurm_login_sshd_config_map_ref_name
498+
sssd_conf_secret_ref_name = var.slurm_sssd_conf_secret_ref_name
499+
sssd_ldap_ca_config_map_ref_name = var.slurm_sssd_ldap_ca_config_map_ref_name
500+
sssd_enabled = var.slurm_sssd_enabled
501+
login_ssh_root_public_keys = var.slurm_login_ssh_root_public_keys
499502

500503
flux_namespace = local.flux_namespace
501504

soperator/installations/example/terraform.tfvars

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,21 @@ slurm_login_public_ip = true
418418
# ---
419419
tailscale_enabled = false
420420

421+
# Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes.
422+
# By default, false
423+
# ---
424+
slurm_sssd_enabled = false
425+
426+
# Name of Secret containing sssd.conf for controller, login, and worker sssd containers.
427+
# By default, empty
428+
# ---
429+
slurm_sssd_conf_secret_ref_name = ""
430+
431+
# Name of ConfigMap containing LDAP CA certificates for controller, login, and worker sssd containers.
432+
# By default, empty
433+
# ---
434+
slurm_sssd_ldap_ca_config_map_ref_name = ""
435+
421436
# Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user.
422437
# ---
423438
slurm_login_ssh_root_public_keys = [

soperator/installations/example/variables.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -984,6 +984,24 @@ variable "slurm_login_sshd_config_map_ref_name" {
984984
default = ""
985985
}
986986

987+
variable "slurm_sssd_conf_secret_ref_name" {
988+
description = "Name of Secret containing sssd.conf propagated to controller, login, and worker sssd containers."
989+
type = string
990+
default = ""
991+
}
992+
993+
variable "slurm_sssd_ldap_ca_config_map_ref_name" {
994+
description = "Name of ConfigMap containing LDAP CA certificates propagated to controller, login, and worker sssd containers."
995+
type = string
996+
default = ""
997+
}
998+
999+
variable "slurm_sssd_enabled" {
1000+
description = "Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes."
1001+
type = bool
1002+
default = false
1003+
}
1004+
9871005
variable "slurm_login_ssh_root_public_keys" {
9881006
description = "Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user."
9891007
type = list(string)

soperator/modules/slurm/flux_release_nodesets.tf

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,23 @@ resource "local_file" "flux_release_rendered_nodesets" {
88

99
nodesets = var.worker_nodesets
1010
resources = [for res in var.resources.worker : {
11-
cpu_cores = res.cpu_cores
12-
memory_gibibytes = res.memory_gibibytes
13-
ephemeral_storage_gibibytes = res.ephemeral_storage_gibibytes
14-
gpus = res.gpus
15-
shared_memory = var.shared_memory_size_gibibytes
11+
cpu_cores = floor(
12+
res.cpu_cores
13+
-local.resources.munge.cpu
14+
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
15+
) - local.resources.kruise_daemon.cpu
16+
memory_gibibytes = floor(
17+
res.memory_gibibytes
18+
-local.resources.munge.memory
19+
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
20+
) - local.resources.kruise_daemon.memory
21+
ephemeral_storage_gibibytes = floor(
22+
res.ephemeral_storage_gibibytes
23+
-local.resources.munge.ephemeral_storage
24+
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
25+
)
26+
gpus = res.gpus
27+
shared_memory = var.shared_memory_size_gibibytes
1628
}]
1729

1830
jail_submounts = {
@@ -47,6 +59,13 @@ resource "local_file" "flux_release_rendered_nodesets" {
4759
config_map_ref = var.worker_sshd_config_map_ref_name
4860
}
4961

62+
sssd = {
63+
enabled = var.sssd_enabled
64+
conf_secret_ref_name = var.sssd_conf_secret_ref_name
65+
ldap_ca_config_map_ref_name = var.sssd_ldap_ca_config_map_ref_name
66+
resources = local.resources.sssd
67+
}
68+
5069
extra = local.slurm_node_extra
5170
})
5271
}

soperator/modules/slurm/locals.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ locals {
9797
memory = 0.5
9898
ephemeral_storage = 5
9999
}
100+
sssd = {
101+
cpu = 0.2
102+
memory = 0.5
103+
ephemeral_storage = 5
104+
}
100105
exporter = {
101106
cpu = 0.25
102107
memory = 0.25

soperator/modules/slurm/main.tf

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -178,19 +178,45 @@ resource "helm_release" "soperator_fluxcd_cm" {
178178
controller = {
179179
size = var.node_count.controller
180180
resources = {
181-
cpu = floor(var.resources.controller.cpu_cores - local.resources.munge.cpu - local.resources.kruise_daemon.cpu)
182-
memory = floor(var.resources.controller.memory_gibibytes - local.resources.munge.memory - local.resources.kruise_daemon.memory)
183-
ephemeral_storage = floor(var.resources.controller.ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
181+
cpu = floor(
182+
var.resources.controller.cpu_cores
183+
-local.resources.munge.cpu
184+
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
185+
-local.resources.kruise_daemon.cpu
186+
)
187+
memory = floor(
188+
var.resources.controller.memory_gibibytes
189+
-local.resources.munge.memory
190+
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
191+
-local.resources.kruise_daemon.memory
192+
)
193+
ephemeral_storage = floor(
194+
var.resources.controller.ephemeral_storage_gibibytes
195+
-local.resources.munge.ephemeral_storage
196+
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
197+
)
184198
}
185199
}
186200

187201
worker = {
188202
size = 0
189203
resources = {
190-
cpu = floor(var.resources.worker[0].cpu_cores - local.resources.munge.cpu) - local.resources.kruise_daemon.cpu
191-
memory = floor(var.resources.worker[0].memory_gibibytes - local.resources.munge.memory) - local.resources.kruise_daemon.memory
192-
ephemeral_storage = floor(var.resources.worker[0].ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
193-
gpus = var.resources.worker[0].gpus
204+
cpu = floor(
205+
var.resources.worker[0].cpu_cores
206+
-local.resources.munge.cpu
207+
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
208+
) - local.resources.kruise_daemon.cpu
209+
memory = floor(
210+
var.resources.worker[0].memory_gibibytes
211+
-local.resources.munge.memory
212+
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
213+
) - local.resources.kruise_daemon.memory
214+
ephemeral_storage = floor(
215+
var.resources.worker[0].ephemeral_storage_gibibytes
216+
-local.resources.munge.ephemeral_storage
217+
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
218+
)
219+
gpus = var.resources.worker[0].gpus
194220
}
195221
shared_memory = var.shared_memory_size_gibibytes
196222
slurm_node_extra = local.slurm_node_extra
@@ -204,9 +230,23 @@ resource "helm_release" "soperator_fluxcd_cm" {
204230
root_public_keys = var.login_ssh_root_public_keys
205231
public_ip = var.login_public_ip
206232
resources = {
207-
cpu = floor(var.resources.login.cpu_cores - local.resources.munge.cpu - local.resources.kruise_daemon.cpu)
208-
memory = floor(var.resources.login.memory_gibibytes - local.resources.munge.memory - local.resources.kruise_daemon.memory)
209-
ephemeral_storage = floor(var.resources.login.ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage)
233+
cpu = floor(
234+
var.resources.login.cpu_cores
235+
-local.resources.munge.cpu
236+
-(var.sssd_enabled ? local.resources.sssd.cpu : 0)
237+
-local.resources.kruise_daemon.cpu
238+
)
239+
memory = floor(
240+
var.resources.login.memory_gibibytes
241+
-local.resources.munge.memory
242+
-(var.sssd_enabled ? local.resources.sssd.memory : 0)
243+
-local.resources.kruise_daemon.memory
244+
)
245+
ephemeral_storage = floor(
246+
var.resources.login.ephemeral_storage_gibibytes
247+
-local.resources.munge.ephemeral_storage
248+
-(var.sssd_enabled ? local.resources.sssd.ephemeral_storage : 0)
249+
)
210250
}
211251
}
212252

@@ -219,6 +259,13 @@ resource "helm_release" "soperator_fluxcd_cm" {
219259
resources = local.resources.munge
220260
}
221261

262+
sssd = {
263+
enabled = var.sssd_enabled
264+
conf_secret_ref_name = var.sssd_conf_secret_ref_name
265+
ldap_ca_config_map_ref_name = var.sssd_ldap_ca_config_map_ref_name
266+
resources = local.resources.sssd
267+
}
268+
222269
rest = {
223270
enabled = var.rest_enabled
224271
resources = local.resources.rest

soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,16 @@ nodesets:
204204
security:
205205
appArmorProfile: unconfined
206206

207+
sssd:
208+
enabled: ${sssd.enabled}
209+
resources:
210+
cpu: ${sssd.resources.cpu * 1000}m
211+
memory: ${sssd.resources.memory}Gi
212+
ephemeralStorage: ${sssd.resources.ephemeral_storage}Gi
213+
214+
sssdConfSecretRefName: ${jsonencode(sssd.conf_secret_ref_name)}
215+
sssdLdapCAConfigMapRefName: ${jsonencode(sssd.ldap_ca_config_map_ref_name)}
216+
207217
configMapRefSupervisord: custom-supervisord-config
208218
configMapRefSshd: ${sshd.config_map_ref}
209219

soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,15 @@ resources:
449449
k8sNodeFilterName: system
450450

451451
slurmNodes:
452+
sssd:
453+
enabled: ${slurm_cluster.nodes.sssd.enabled}
454+
resources:
455+
cpu: ${slurm_cluster.nodes.sssd.resources.cpu * 1000}m
456+
memory: ${slurm_cluster.nodes.sssd.resources.memory}Gi
457+
ephemeralStorage: ${slurm_cluster.nodes.sssd.resources.ephemeral_storage}Gi
458+
sssdConfSecretRefName: "${slurm_cluster.nodes.sssd.conf_secret_ref_name}"
459+
sssdLdapCAConfigMapRefName: "${slurm_cluster.nodes.sssd.ldap_ca_config_map_ref_name}"
460+
452461
accounting:
453462
enabled: ${accounting_enabled}
454463
k8sNodeFilterName: ${slurm_cluster.k8s_node_filters.accounting.name}

soperator/modules/slurm/variables.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,24 @@ variable "login_sshd_config_map_ref_name" {
178178
default = ""
179179
}
180180

181+
variable "sssd_conf_secret_ref_name" {
182+
description = "Name of Secret containing sssd.conf propagated to controller, login, and worker sssd containers."
183+
type = string
184+
default = ""
185+
}
186+
187+
variable "sssd_ldap_ca_config_map_ref_name" {
188+
description = "Name of ConfigMap containing LDAP CA certificates propagated to controller, login, and worker sssd containers."
189+
type = string
190+
default = ""
191+
}
192+
193+
variable "sssd_enabled" {
194+
description = "Whether to enable the SSSD sidecar on Slurm controller, login, and worker nodes."
195+
type = bool
196+
default = false
197+
}
198+
181199
variable "login_ssh_root_public_keys" {
182200
description = "Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user."
183201
type = list(string)

0 commit comments

Comments
 (0)