Skip to content

Commit 3c6681e

Browse files
Fixing node memory handling (#655)
* Attempt using TRES_CORE_MEMORY
* instead of //32 and capped at 2000, //4 +1000 sounds more reasonable
* fixed equation
* log warning if ram < 4096
* added unit
* line too long
1 parent f1a97ae commit 3c6681e

File tree

2 files changed

+12
-7
lines changed

2 files changed

+12
-7
lines changed

bibigrid/core/utility/validate_configuration.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,8 @@ def validate(self):
208208
checks = [("master/vpn", self.check_master_vpn_worker), ("servergroup", self.check_server_group),
209209
("instances", self.check_instances), ("volumes", self.check_volumes), ("network", self.check_network),
210210
("quotas", self.check_quotas), ("sshPublicKeyFiles", self.check_ssh_public_key_files),
211-
("cloudYamls", self.check_clouds_yamls), ("nfs", self.check_nfs), ("global security groups",
212-
self.check_configurations_security_groups)]
211+
("cloudYamls", self.check_clouds_yamls), ("nfs", self.check_nfs),
212+
("global security groups", self.check_configurations_security_groups)]
213213
if success:
214214
for check_name, check_function in checks:
215215
success = evaluate(check_name, check_function(), self.log) and success
@@ -223,7 +223,7 @@ def _check_security_groups(self, provider, security_groups):
223223
security_group = provider.get_security_group(security_group_name)
224224
if not security_group:
225225
self.log.warning(f"Couldn't find security group {security_group} on "
226-
f"cloud {provider.cloud_specification['identifier']}")
226+
f"cloud {provider.cloud_specification['identifier']}")
227227
success = False
228228
else:
229229
self.log.debug(f"Found {security_group_name} on cloud {provider.cloud_specification['identifier']}")
@@ -349,6 +349,10 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p
349349
(type_max_ram, image_min_ram, "ram")]:
350350
success = has_enough(maximum, needed, f"Type {instance_type}", thing, self.log) and success
351351
# prepare check quotas
352+
if type_max_ram < 4096:
353+
self.log.warning(
354+
f"Flavor {instance_type} on {provider.cloud_specification['identifier']} has {type_max_ram} but should "
355+
f"at least have 4096 MiB of RAM to efficiently run slurm and jobs!")
352356
self.required_resources_dict[provider.cloud_specification['identifier']]["total_ram"] += type_max_ram
353357
self.required_resources_dict[provider.cloud_specification['identifier']]["total_cores"] += flavor["vcpus"]
354358
return success

resources/defaults/slurm/slurm.j2

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ Waittime=0
4949
# SCHEDULING
5050
SchedulerType=sched/backfill
5151
SelectType=select/cons_tres
52-
SelectTypeParameters=CR_Core
52+
SelectTypeParameters=CR_Core_Memory
5353

5454
# ACCOUNTING
5555
AccountingStorageType=accounting_storage/slurmdbd
@@ -75,8 +75,9 @@ SlurmdLogFile=/var/log/slurm/slurmd.log
7575
{% set _ = exclude_groups.append(node.name) %}
7676
{% endif %}
7777
{% set _ = node_groups.append(node.name) %}
78-
{% set mem = (node.flavor.ram // 1024) * 1000 %}
79-
NodeName={{ node.name }} SocketsPerBoard={{ node.flavor.vcpus }} CoresPerSocket=1 RealMemory={{ mem - [mem // 2, 16000] | min }} State={{node.state }} {{"Features=" + (node.features | join(",")) if node.features is defined }}# {{ node.cloud_identifier }}
78+
{# MiB to MB #}
79+
{% set mem = (node.flavor.ram * 1.048576) | int %}
80+
NodeName={{ node.name }} SocketsPerBoard={{ node.flavor.vcpus }} CoresPerSocket=1 RealMemory={{ mem }} MemSpecLimit={{ [mem//4 + 1000, 8000] | min }} State={{node.state }} {{"Features=" + (node.features | join(",")) if node.features is defined }}# {{ node.cloud_identifier }}
8081
{% for partition in node.partitions %}
8182
{% if partition not in partitions %}
8283
{% set _ = partitions.update({partition: []}) %}
@@ -118,4 +119,4 @@ ResumeFailProgram=/opt/slurm/fail.sh
118119
# job container
119120
# TO BE TESTED
120121
JobContainerType=job_container/tmpfs
121-
PrologFlags=Contain
122+
PrologFlags=Contain

0 commit comments

Comments
 (0)