# main.tf
locals {
resources = {
system = module.resources.by_platform[var.slurm_nodeset_system.resource.platform][var.slurm_nodeset_system.resource.preset]
controller = module.resources.by_platform[var.slurm_nodeset_controller.resource.platform][var.slurm_nodeset_controller.resource.preset]
workers = [for worker in var.slurm_nodeset_workers : module.resources.by_platform[worker.resource.platform][worker.resource.preset]]
login = module.resources.by_platform[var.slurm_nodeset_login.resource.platform][var.slurm_nodeset_login.resource.preset]
accounting = var.slurm_nodeset_accounting != null ? module.resources.by_platform[var.slurm_nodeset_accounting.resource.platform][var.slurm_nodeset_accounting.resource.preset] : null
nfs = var.slurm_nodeset_nfs != null ? module.resources.by_platform[var.slurm_nodeset_nfs.resource.platform][var.slurm_nodeset_nfs.resource.preset] : null
}
# keep in sync with helm chart
# https://github.com/nebius/soperator/blob/main/helm/storageclasses/templates/storageclasses.yaml#L4
storage_class_prefix = "compute-csi"
slurm_cluster_name = "soperator"
flux_namespace = "flux-system"
k8s_cluster_name = format("soperator-%s", var.company_name)
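  # Backups are force-enabled explicitly, or auto-enabled when the calculated
  # jail filestore size is below 12 * 1024 GiB (12 TiB).
  # local.filestore_jail_calculated_size_gibibytes is presumably defined in a
  # sibling .tf file of this root module.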
backups_enabled = (var.backups_enabled == "force_enable" ||
(var.backups_enabled == "auto" && local.filestore_jail_calculated_size_gibibytes < 12 * 1024))
# Legacy node_group_workers for old-style deployments (without nodesets)
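  # Node groups are capped at 100 nodes, so each nodeset is split into
  # ceil(size / 100.0) subsets. As an illustration (hypothetical values), a
  # nodeset of size 250 yields 3 subsets sized min(100, 250 - subset * 100):
  #   subset 0 -> 100, subset 1 -> 100, subset 2 -> 50.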
node_group_workers = flatten([for i, nodeset in var.slurm_nodeset_workers : [
for subset in range(ceil(nodeset.size / 100.0)) : {
size = min(100, nodeset.size - subset * 100)
max_unavailable_percent = 50
max_surge_percent = null
drain_timeout = null
resource = nodeset.resource
boot_disk = nodeset.boot_disk
gpu_cluster = nodeset.gpu_cluster
nodeset_index = i
subset_index = subset
preemptible = nodeset.preemptible
}
]])
# V2 node_group_workers for new-style deployments (with nodesets)
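  # Autoscaling min_size is distributed fill-first across subsets. As an
  # illustration (hypothetical values), size = 250 with autoscaling.min_size = 150:
  #   subset 0: min_size = min(100, max(0, 150 - 0))   = 100, max_size = 100
  #   subset 1: min_size = min(100, max(0, 150 - 100)) = 50,  max_size = 100
  #   subset 2: min_size = min(100, max(0, 150 - 200)) = 0,   max_size = 50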
node_group_workers_v2 = flatten([for i, nodeset in var.slurm_nodeset_workers : [
for subset in range(ceil(nodeset.size / 100.0)) : {
name = nodeset.name
size = min(100, nodeset.size - subset * 100)
min_size = (
nodeset.autoscaling.enabled && nodeset.autoscaling.min_size != null
? min(100, max(0, nodeset.autoscaling.min_size - subset * 100)) # fill-first distribution
: min(100, nodeset.size - subset * 100) # min=max
)
max_size = max(1, min(100, nodeset.size - subset * 100))
autoscaling = nodeset.autoscaling.enabled
resource = nodeset.resource
boot_disk = nodeset.boot_disk
gpu_cluster = nodeset.gpu_cluster
nodeset_index = i
subset_index = subset
preemptible = nodeset.preemptible
reservation_policy = nodeset.reservation_policy
local_nvme = {
enabled = try(nodeset.local_nvme.enabled, false)
mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
}
}
]])
}
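# Aggregates the variable validation resources (presumably defined in sibling
# .tf files of this root module) so that downstream modules only need to
# depend on a single resource.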
resource "terraform_data" "check_variables" {
depends_on = [
terraform_data.check_slurm_nodeset,
terraform_data.check_slurm_nodeset_accounting,
terraform_data.check_nfs,
terraform_data.check_local_nvme,
]
}
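# Shared filestores: controller spool, jail, optional accounting storage, and
# per-submount jail filesystems. Newly created filestores use NETWORK_SSD;
# existing ones are referenced by ID instead.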
module "filestore" {
source = "../../modules/filestore"
depends_on = [
terraform_data.check_variables,
]
iam_project_id = data.nebius_iam_v1_project.this.id
k8s_cluster_name = local.k8s_cluster_name
controller_spool = {
spec = var.filestore_controller_spool.spec != null ? {
disk_type = "NETWORK_SSD"
size_gibibytes = var.filestore_controller_spool.spec.size_gibibytes
block_size_kibibytes = var.filestore_controller_spool.spec.block_size_kibibytes
} : null
existing = var.filestore_controller_spool.existing != null ? {
id = var.filestore_controller_spool.existing.id
} : null
}
accounting = var.accounting_enabled ? {
spec = var.filestore_accounting.spec != null ? {
disk_type = "NETWORK_SSD"
size_gibibytes = var.filestore_accounting.spec.size_gibibytes
block_size_kibibytes = var.filestore_accounting.spec.block_size_kibibytes
} : null
existing = var.filestore_accounting.existing != null ? {
id = var.filestore_accounting.existing.id
} : null
} : null
jail = {
spec = var.filestore_jail.spec != null ? {
disk_type = "NETWORK_SSD"
size_gibibytes = var.filestore_jail.spec.size_gibibytes
block_size_kibibytes = var.filestore_jail.spec.block_size_kibibytes
} : null
existing = var.filestore_jail.existing != null ? {
id = var.filestore_jail.existing.id
} : null
}
jail_submounts = [for submount in var.filestore_jail_submounts : {
name = submount.name
spec = submount.spec != null ? {
disk_type = "NETWORK_SSD"
size_gibibytes = submount.spec.size_gibibytes
block_size_kibibytes = submount.spec.block_size_kibibytes
} : null
existing = submount.existing != null ? {
id = submount.existing.id
} : null
}]
providers = {
nebius = nebius
units = units
}
}
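# Optional standalone NFS server VM, created only when var.nfs.enabled is set.
# The requested size is converted from GiB via provider::units::from_gib, and
# access is limited to the subnet's first private IPv4 CIDR.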
module "nfs-server" {
count = var.nfs.enabled ? 1 : 0
source = "../../../modules/nfs-server"
parent_id = data.nebius_iam_v1_project.this.id
subnet_id = data.nebius_vpc_v1_subnet.this.id
platform = var.nfs.resource.platform
preset = var.nfs.resource.preset
instance_name = "${local.k8s_cluster_name}-nfs-server"
nfs_disk_name_suffix = local.k8s_cluster_name
nfs_ip_range = data.nebius_vpc_v1_subnet.this.status.ipv4_private_cidrs[0]
nfs_size = provider::units::from_gib(var.nfs.size_gibibytes)
nfs_path = "/nfs"
ssh_user_name = "soperator"
ssh_public_keys = var.slurm_login_ssh_root_public_keys
public_ip = var.nfs.public_ip
providers = {
nebius = nebius
}
}
module "cleanup" {
source = "../../modules/cleanup"
iam_project_id = var.iam_project_id
}
module "k8s_cleanup" {
source = "../../modules/k8s_cleanup"
k8s_cluster_context = module.k8s.cluster_context
k8s_cluster_id = module.k8s.cluster_id
depends_on = [
module.k8s,
]
}
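# Managed Kubernetes cluster hosting Soperator. Node groups are created for
# the system, controller, login, and worker nodesets (both the legacy split
# and the v2 layout), plus the optional accounting and NFS nodesets.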
module "k8s" {
depends_on = [
module.filestore,
module.nfs-server,
module.cleanup,
terraform_data.check_slurm_nodeset_accounting,
terraform_data.check_slurm_nodeset,
]
source = "../../modules/k8s"
iam_project_id = data.nebius_iam_v1_project.this.id
vpc_subnet_id = data.nebius_vpc_v1_subnet.this.id
login_public_ip = var.slurm_login_public_ip
k8s_version = var.k8s_version
name = local.k8s_cluster_name
company_name = var.company_name
platform_driver_presets = var.platform_driver_presets
use_preinstalled_gpu_drivers = var.use_preinstalled_gpu_drivers
etcd_cluster_size = var.etcd_cluster_size
node_group_system = var.slurm_nodeset_system
node_group_controller = var.slurm_nodeset_controller
node_group_workers = local.node_group_workers
node_group_workers_v2 = local.node_group_workers_v2
node_group_login = var.slurm_nodeset_login
node_group_accounting = {
enabled = var.accounting_enabled
spec = var.slurm_nodeset_accounting
}
node_group_nfs = {
enabled = var.slurm_nodeset_nfs != null
spec = var.slurm_nodeset_nfs
}
filestores = {
controller_spool = {
id = module.filestore.controller_spool.id
mount_tag = module.filestore.controller_spool.mount_tag
}
jail = {
id = module.filestore.jail.id
mount_tag = module.filestore.jail.mount_tag
}
jail_submounts = [for key, submount in module.filestore.jail_submounts : {
id = submount.id
mount_tag = submount.mount_tag
}]
accounting = var.accounting_enabled ? {
id = module.filestore.accounting.id
mount_tag = module.filestore.accounting.mount_tag
} : null
}
node_ssh_access_users = var.k8s_cluster_node_ssh_access_users
nvidia_admin_conf_lines = var.nvidia_admin_conf_lines
providers = {
nebius = nebius
units = units
}
}
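# The NVIDIA network and GPU operators are installed only when GPU node groups
# are involved and preinstalled GPU drivers are not used; the GPU operator is
# applied after the network operator.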
module "nvidia_operator_network" {
count = module.k8s.gpu_involved && !var.use_preinstalled_gpu_drivers ? 1 : 0
depends_on = [
module.k8s
]
source = "../../../modules/network-operator"
cluster_id = module.k8s.cluster_id
parent_id = data.nebius_iam_v1_project.this.id
providers = {
nebius = nebius
}
}
module "nvidia_operator_gpu" {
count = module.k8s.gpu_involved && !var.use_preinstalled_gpu_drivers ? 1 : 0
depends_on = [
module.nvidia_operator_network
]
source = "../../../modules/gpu-operator"
cluster_id = module.k8s.cluster_id
parent_id = data.nebius_iam_v1_project.this.id
enable_dcgm_exporter = var.dcgm_job_mapping_enabled == false && var.telemetry_enabled
enable_dcgm_service_monitor = var.dcgm_job_mapping_enabled == false && var.telemetry_enabled
relabel_dcgm_exporter = var.telemetry_enabled
providers = {
nebius = nebius
}
}
module "o11y" {
count = var.public_o11y_enabled ? 1 : 0
depends_on = [
module.k8s,
module.fluxcd,
]
source = "../../modules/o11y"
iam_project_id = var.iam_project_id
o11y_iam_tenant_id = var.o11y_iam_tenant_id
o11y_profile = var.o11y_profile
k8s_cluster_context = module.k8s.cluster_context
company_name = var.company_name
}
module "slurm" {
depends_on = [
module.k8s,
module.o11y,
module.fluxcd,
module.backups,
]
source = "../../modules/slurm"
active_checks_scope = var.active_checks_scope
region = var.region
iam_tenant_id = var.iam_tenant_id
iam_project_id = var.iam_project_id
cluster_name = var.company_name
name = local.slurm_cluster_name
k8s_cluster_context = module.k8s.cluster_context
k8s_cluster_id = module.k8s.cluster_id
operator_version = var.slurm_operator_version
operator_stable = var.slurm_operator_stable
maintenance = var.maintenance
maintenance_ignore_node_groups = var.maintenance_ignore_node_groups
use_preinstalled_gpu_drivers = var.use_preinstalled_gpu_drivers
cuda_version = lookup(var.platform_cuda_versions, var.slurm_nodeset_workers[0].resource.platform)
controller_state_on_filestore = var.controller_state_on_filestore
node_count = {
controller = var.slurm_nodeset_controller.size
worker = [for workers in var.slurm_nodeset_workers : workers.size]
login = var.slurm_nodeset_login.size
}
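  # Ephemeral storage available to pods is derived from the boot disk size:
  #   floor(boot_disk_size_gibibytes * k8s_ephemeral_storage_coefficient
  #         - k8s_ephemeral_storage_reserve.gibibytes)
  # With illustrative values (the real coefficient and reserve come from
  # module.resources), a 1024 GiB boot disk, a 0.9 coefficient, and a 64 GiB
  # reserve would yield floor(1024 * 0.9 - 64) = 857 GiB.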
resources = {
system = {
cpu_cores = local.resources.system.cpu_cores
memory_gibibytes = local.resources.system.memory_gibibytes
ephemeral_storage_gibibytes = floor(
var.slurm_nodeset_system.boot_disk.size_gibibytes * module.resources.k8s_ephemeral_storage_coefficient
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
}
controller = {
cpu_cores = local.resources.controller.cpu_cores
memory_gibibytes = floor(local.resources.controller.memory_gibibytes)
ephemeral_storage_gibibytes = floor(
var.slurm_nodeset_controller.boot_disk.size_gibibytes * module.resources.k8s_ephemeral_storage_coefficient
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
}
worker = [for i, worker in var.slurm_nodeset_workers :
{
cpu_cores = local.resources.workers[i].cpu_cores
memory_gibibytes = floor(local.resources.workers[i].memory_gibibytes)
ephemeral_storage_gibibytes = floor(
worker.boot_disk.size_gibibytes * module.resources.k8s_ephemeral_storage_coefficient
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
gpus = local.resources.workers[i].gpus
}
]
login = {
cpu_cores = local.resources.login.cpu_cores
memory_gibibytes = floor(local.resources.login.memory_gibibytes)
ephemeral_storage_gibibytes = floor(
var.slurm_nodeset_login.boot_disk.size_gibibytes * module.resources.k8s_ephemeral_storage_coefficient
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
}
accounting = var.accounting_enabled ? {
cpu_cores = local.resources.accounting.cpu_cores
memory_gibibytes = floor(local.resources.accounting.memory_gibibytes)
ephemeral_storage_gibibytes = floor(
var.slurm_nodeset_accounting.boot_disk.size_gibibytes * module.resources.k8s_ephemeral_storage_coefficient
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
} : null
nfs = var.slurm_nodeset_nfs != null ? {
cpu_cores = local.resources.nfs.cpu_cores
memory_gibibytes = floor(local.resources.nfs.memory_gibibytes)
} : null
}
filestores = {
controller_spool = {
size_gibibytes = module.filestore.controller_spool.size_gibibytes
device = module.filestore.controller_spool.mount_tag
}
jail = {
size_gibibytes = module.filestore.jail.size_gibibytes
device = module.filestore.jail.mount_tag
}
jail_submounts = [for submount in var.filestore_jail_submounts : {
name = submount.name
size_gibibytes = module.filestore.jail_submounts[submount.name].size_gibibytes
device = module.filestore.jail_submounts[submount.name].mount_tag
mount_path = submount.mount_path
}]
accounting = var.accounting_enabled ? {
size_gibibytes = module.filestore.accounting.size_gibibytes
device = module.filestore.accounting.mount_tag
} : null
}
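  # Storage class names follow the Helm storageclasses chart referenced above:
  #   replace("${local.storage_class_prefix}-${lower(disk_type)}-${lower(filesystem_type)}", "_", "-")
  # e.g. disk_type "NETWORK_SSD" with filesystem_type "ext4" maps to
  # "compute-csi-network-ssd-ext4".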
node_local_jail_submounts = [for sm in var.node_local_jail_submounts : {
name = sm.name
mount_path = sm.mount_path
size_gibibytes = sm.size_gibibytes
disk_type = sm.disk_type
filesystem_type = sm.filesystem_type
storage_class_name = replace("${local.storage_class_prefix}-${lower(sm.disk_type)}-${lower(sm.filesystem_type)}", "_", "-")
}]
node_local_image_storage = {
enabled = var.node_local_image_disk.enabled
spec = var.node_local_image_disk.enabled ? {
size_gibibytes = var.node_local_image_disk.spec.size_gibibytes
filesystem_type = var.node_local_image_disk.spec.filesystem_type
storage_class_name = replace("${local.storage_class_prefix}-${lower(var.node_local_image_disk.spec.disk_type)}-${lower(var.node_local_image_disk.spec.filesystem_type)}", "_", "-")
} : null
}
nfs = {
enabled = var.nfs.enabled
path = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
host = var.nfs.enabled ? module.nfs-server[0].nfs_server_internal_ip : null
mount_path = var.nfs.enabled ? var.nfs.mount_path : null
}
nfs_in_k8s = {
enabled = var.nfs_in_k8s.enabled
version = var.nfs_in_k8s.version
size_gibibytes = var.nfs_in_k8s.size_gibibytes
storage_class = replace("${local.storage_class_prefix}-${lower(var.nfs_in_k8s.disk_type)}-${lower(var.nfs_in_k8s.filesystem_type)}", "_", "-")
threads = var.nfs_in_k8s.threads
}
nfs_node_group_enabled = var.slurm_nodeset_nfs != null
exporter_enabled = var.slurm_exporter_enabled
rest_enabled = var.slurm_rest_enabled
accounting_enabled = var.accounting_enabled
telemetry_enabled = var.telemetry_enabled
public_o11y_enabled = var.public_o11y_enabled
soperator_notifier = var.soperator_notifier
backups_enabled = local.backups_enabled
backups_config = {
secret_name = local.backups_enabled ? module.backups[0].secret_name : null
password = var.backups_password
schedule = var.backups_schedule
prune_schedule = var.backups_prune_schedule
retention = var.backups_retention
storage = {
      bucket    = local.backups_enabled ? module.backups_store[0].name : null
      endpoint  = local.backups_enabled ? module.backups_store[0].endpoint : null
      bucket_id = local.backups_enabled ? module.backups_store[0].bucket_id : null
}
}
slurmdbd_config = var.slurmdbd_config
slurm_accounting_config = var.slurm_accounting_config
use_default_apparmor_profile = var.use_default_apparmor_profile
worker_sshd_config_map_ref_name = var.slurm_worker_sshd_config_map_ref_name
shared_memory_size_gibibytes = var.slurm_shared_memory_size_gibibytes
slurm_partition_config_type = var.slurm_partition_config_type
slurm_partition_raw_config = var.slurm_partition_raw_config
slurm_health_check_config = var.slurm_health_check_config
slurm_nodesets_partitions = var.slurm_nodesets_partitions
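  # Node features combine the snake_cased platform and boot disk type with any
  # user-supplied features. As an illustration (assuming a platform name such
  # as "gpu-h100-sxm" and a "NETWORK_SSD" boot disk), the derived features
  # would presumably be "gpu_h100_sxm" and "network_ssd".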
worker_nodesets = [for nodeset in var.slurm_nodeset_workers : {
name = nodeset.name
replicas = nodeset.size
max_unavailable = "20%"
features = concat(
[
provider::string-functions::snake_case(nodeset.resource.platform),
provider::string-functions::snake_case(nodeset.boot_disk.type),
],
nodeset.features != null ? nodeset.features : []
)
cpu_topology = module.resources.cpu_topology_by_platform[nodeset.resource.platform][nodeset.resource.preset]
gres_name = lookup(module.resources.gres_name_by_platform, nodeset.resource.platform, null)
gres_config = lookup(module.resources.gres_config_by_platform, nodeset.resource.platform, null)
create_partition = nodeset.create_partition != null ? nodeset.create_partition : false
ephemeral_nodes = nodeset.ephemeral_nodes
local_nvme = {
enabled = try(nodeset.local_nvme.enabled, false)
mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
}
}]
login_allocation_id = module.k8s.static_ip_allocation_id
login_public_ip = var.slurm_login_public_ip
tailscale_enabled = var.tailscale_enabled
login_sshd_config_map_ref_name = var.slurm_login_sshd_config_map_ref_name
sssd_conf_secret_ref_name = var.slurm_sssd_conf_secret_ref_name
sssd_ldap_ca_config_map_ref_name = var.slurm_sssd_ldap_ca_config_map_ref_name
sssd_enabled = var.slurm_sssd_enabled
login_ssh_root_public_keys = var.slurm_login_ssh_root_public_keys
flux_namespace = local.flux_namespace
providers = {
helm = helm
}
}
module "login_script" {
depends_on = [
module.slurm
]
source = "../../modules/login"
slurm_cluster_name = local.slurm_cluster_name
k8s_cluster_context = module.k8s.cluster_context
providers = {
kubernetes = kubernetes
}
}
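# Both backup modules are created only when local.backups_enabled is true
# (forced, or auto-enabled based on the calculated jail filestore size).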
module "backups_store" {
count = local.backups_enabled ? 1 : 0
source = "../../modules/backups_store"
iam_project_id = var.iam_project_id
instance_name = local.k8s_cluster_name
cleanup_bucket_on_destroy = var.cleanup_bucket_on_destroy
depends_on = [
module.k8s,
module.fluxcd,
]
}
module "backups" {
count = local.backups_enabled ? 1 : 0
source = "../../modules/backups"
k8s_cluster_context = module.k8s.cluster_context
k8s_cluster_id = module.k8s.cluster_id
iam_project_id = var.iam_project_id
iam_tenant_id = var.iam_tenant_id
instance_name = local.k8s_cluster_name
soperator_namespace = local.slurm_cluster_name
backups_password = var.backups_password
providers = {
nebius = nebius
}
depends_on = [
module.k8s,
]
}
module "fluxcd" {
depends_on = [
module.k8s,
]
source = "../../modules/fluxcd"
k8s_cluster_context = module.k8s.cluster_context
}