Skip to content

Commit 3166897

Browse files
Fixes GRES on-demand scheduling
By adding a full gres.conf (with entries for every node) to each node, on-demand scheduling of GRES resources is fixed. Also fixes bibilog and enables DebugFlags=Gres and TaskPlugin=task/cgroup (#726).
1 parent f7a13a8 commit 3166897

File tree

6 files changed

+27
-14
lines changed

6 files changed

+27
-14
lines changed

bibigrid/resources/bin/bibilog

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ fi
88
if [ "$2" == "fail" ]; then
99
fail_create="fail"
1010
else
11-
fail_create="create"
11+
fail_create="resume"
1212
fi
1313

1414
LOG="/var/log/slurm/worker_logs/$fail_create/$err_out"

bibigrid/resources/defaults/slurm/slurm.j2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@ MpiDefault=none
1212
ProctrackType=proctrack/cgroup # linuxproc # changed for 23.11.0
1313
# ReturnToService=2
1414
SwitchType=switch/none
15-
TaskPlugin=task/none
16-
#TaskPlugin=task/cgroup
15+
TaskPlugin=task/cgroup
1716
JobAcctGatherType=jobacct_gather/linux
1817

1918
# RESOURCES
@@ -65,6 +64,7 @@ SlurmctldDebug=debug # info
6564
SlurmctldLogFile=/var/log/slurm/slurmctld.log
6665
SlurmdDebug=info
6766
SlurmdLogFile=/var/log/slurm/slurmd.log
67+
DebugFlags=Gres
6868

6969
# COMPUTE NODES
7070
# use_master_as_compute

bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@
3434
state: restarted
3535
when: "'master' in group_names"
3636

37+
- name: Enable Slurmd
38+
systemd:
39+
name: "slurmd"
40+
enabled: true
41+
masked: false
42+
state: started
43+
daemon_reload: true
44+
3745
- name: Slurmd
3846
systemd:
3947
name: slurmd

bibigrid/resources/playbook/roles/bibigrid/tasks/042-slurm.yaml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,4 @@
108108
group: root
109109
mode: "0o444"
110110
force: true
111-
when: flavor.gres is defined
112-
113-
- name: Enable slurmd services
114-
systemd:
115-
name: "slurmd"
116-
enabled: true
117-
masked: false
118-
state: started
119-
daemon_reload: true
111+
notify: Enable Slurmd
Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
# GRES CONFIG
2+
{% for node_name in groups.master + groups.workers %}
3+
{% set node = hostvars[node_name] %}
24
{% set ns = namespace(device_index=0) %}
3-
{% for gres in flavor.gres %}
5+
{% if node.flavor.gres is defined %}
6+
{% for gres in node.flavor.gres %}
47
{% for i in range(gres.count | int) %}
5-
Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ ns.device_index }}
8+
NodeName={{ node.name }} Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ ns.device_index }}
69
{% set ns.device_index = ns.device_index + 1 %}
710
{% endfor %}
11+
{% endfor %}
12+
{% endif %}
813
{% endfor %}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# GRES CONFIG
2+
{% set ns = namespace(device_index=0) %}
3+
{% for gres in flavor.gres %}
4+
{% for i in range(gres.count | int) %}
5+
Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ ns.device_index }}
6+
{% set ns.device_index = ns.device_index + 1 %}
7+
{% endfor %}
8+
{% endfor %}

0 commit comments

Comments
 (0)