Skip to content

Commit c20ef90

Browse files
Fix error with short names in topology
1 parent 092a48b commit c20ef90

File tree

2 files changed: 47 additions and 10 deletions

2 files changed: 47 additions and 10 deletions

playbooks/roles/slurm/tasks/compute-rack-aware.yml

Lines changed: 45 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -118,30 +118,67 @@
118118
run_once: true
119119

120120
- name: Get current nodes in Switch
121-
shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"{{item}}\" | grep Nodes= | awk '{ print $2}' | cut -c 7- | tr '\n' ',' | sed 's/,$/\\n/'"
121+
shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"{{item}}\" | grep Nodes="
122122
register: nodes_in_switch
123123
delegate_to: 127.0.0.1
124124
run_once: true
125125
with_items: "{{racks_to_add}}"
126+
ignore_errors: yes
126127

127-
- name: Get hostlist
128-
vars:
129-
new_line: "{{item.stdout}}{% if item.stdout != ''%},{% endif%}{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) ) %}{% if hostvars[node]['rackID'] == item.item.split(':')[1] and (not ((hostvars[node]['ansible_hostname'] in item.stdout.split(',')|list)|bool)) %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}"
130-
command: "scontrol show hostlistsorted {{new_line[:-1]}}"
131-
register: rack_hostlist
128+
- name: Get current nodes in Switch hostlist
129+
vars:
130+
- switch_list_condensed: "{{item.stdout.split('Nodes=')[1]}}"
131+
command: "scontrol show hostname {{switch_list_condensed }}"
132+
register: switch_hostlist
132133
delegate_to: 127.0.0.1
133134
with_items: "{{nodes_in_switch.results}}"
135+
when: item.rc == 0
136+
137+
- name: Get hostlist if switch exists
138+
vars:
139+
new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}"
140+
command: "scontrol show hostlistsorted {{ item.stdout_lines | union (new_line[:-1].split(',') | list )| join(',') }}"
141+
register: rack_hostlist1
142+
delegate_to: 127.0.0.1
143+
with_items: "{{switch_hostlist.results}}"
144+
run_once: true
145+
when: item.item.rc == 0
146+
147+
- name: Get hostlist if switch does not exists
148+
vars:
149+
new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}"
150+
command: "scontrol show hostlistsorted {{ new_line[:-1] }}"
151+
register: rack_hostlist2
152+
delegate_to: 127.0.0.1
153+
with_items: "{{switch_hostlist.results}}"
154+
run_once: true
155+
when: item.item.rc > 0
156+
157+
- name: get Nodes on switch
158+
set_fact:
159+
nodes_on_switches: "{{nodes_on_switches | default({}) | combine({item.item.item.item : item.stdout } ) }}"
160+
with_items: "{{rack_hostlist1.results}}"
134161
run_once: true
162+
delegate_to: 127.0.0.1
163+
when: item.item.item.rc== 0
135164

165+
- name: get Nodes on switch
166+
set_fact:
167+
nodes_on_switches: "{{nodes_on_switches | default({}) | combine({item.item.item.item : item.stdout } ) }}"
168+
with_items: "{{rack_hostlist2.results}}"
169+
run_once: true
170+
delegate_to: 127.0.0.1
171+
when: item.item.item.rc > 0
172+
136173
- name: Add the nodes in the rack switches
137174
become: true
138175
lineinfile:
139176
path: "{{ slurm_conf_path }}/topology.conf"
140177
regexp: "SwitchName={{item.item.item}}\\sNodes.*"
141-
line: "SwitchName={{item.item.item}} Nodes={{item.stdout}}"
178+
line: "SwitchName={{item.item.item}} Nodes={{nodes_on_switches[item.item.item]}}"
142179
state: present
143180
run_once: true
144-
with_items: "{{rack_hostlist.results}}"
181+
with_items: "{{switch_hostlist.results}}"
145182
delegate_to: 127.0.0.1
146183
notify: reconfigure slurm
147184

playbooks/roles/slurm/tasks/compute.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -98,12 +98,12 @@
9898
run_once: true
9999
delegate_to: 127.0.0.1
100100
- name: Get cluster_hostlist
101-
command: "scontrol show hostname {{cluster_switch.stdout.split('Nodes=')[1].split(',')}}"
101+
command: "scontrol show hostname {{cluster_switch.stdout.split('Nodes=')[1]}}"
102102
register: cluster_hostlist
103103
run_once: true
104104
delegate_to: 127.0.0.1
105105
- name: Create new cluster_hostlist
106-
command: "scontrol show hostlistsorted {{cluster_hostlist.stdout | list | union(nodes_to_add) | join(',') }}"
106+
command: "scontrol show hostlistsorted {{cluster_hostlist.stdout_lines | union(nodes_to_add) | join(',') }}"
107107
register: cluster_hostlist_condensed_results
108108
delegate_to: 127.0.0.1
109109
rescue:

0 commit comments

Comments
 (0)