Skip to content

Commit 0800e6b

Browse files
Fix small issue with destroy race condition
1 parent a9295aa commit 0800e6b

File tree

2 files changed

+22
-20
lines changed

2 files changed

+22
-20
lines changed

playbooks/roles/slurm/tasks/destroy-rack-aware.yml

Lines changed: 20 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,24 @@
57 57
set_fact: nodes_to_remove="{{nodes_to_remove_temp_results.results | map(attribute='ansible_facts.nodes_to_remove_temp') | list}}"
58 58
run_once: true
59 59

60+
- name: Get new inactive_nodes list
61+
command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_remove) | join(',')}}"
62+
register: new_inactive_list
63+
run_once: true
64+
delegate_to: 127.0.0.1
65+
66+
- name: Adding nodes to inactive
67+
vars:
68+
- keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}"
69+
become: true
70+
lineinfile:
71+
path: "{{ slurm_conf_path }}/topology.conf"
72+
regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*"
73+
line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout }}"
74+
state: present
75+
run_once: true
76+
delegate_to: 127.0.0.1
77+
60 78
- name: Run the script to get the RackID
61 79
shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host | jq .rackId'
62 80
# shell: echo $RANDOM | md5sum | head -c 20
@@ -160,26 +178,9 @@
160 178
delegate_to: 127.0.0.1
161 179
when: racks_left_list | list | length > 0
162 180

163-
- name: Get new inactive_nodes list
164-
command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_remove) | join(',')}}"
165-
register: new_inactive_list
166-
run_once: true
167-
delegate_to: 127.0.0.1
168-
169-
- name: Adding nodes to inactive
170-
vars:
171-
- keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}"
172-
become: true
173-
lineinfile:
174-
path: "{{ slurm_conf_path }}/topology.conf"
175-
regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*"
176-
line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout }}"
177-
state: present
178-
run_once: true
179-
delegate_to: 127.0.0.1
180-
181 181
- name: Reconfigure Slurm for topology
182 182
become: true
183 183
command: "scontrol reconfigure"
184 184
delegate_to: 127.0.0.1
185-
run_once: true
185+
run_once: true
186+
ignore_errors: yes

playbooks/roles/slurm/tasks/destroy.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -120,4 +120,5 @@
120 120
become: true
121 121
command: "scontrol reconfigure"
122 122
delegate_to: 127.0.0.1
123-
run_once: true
123+
run_once: true
124+
ignore_errors: true

0 commit comments

Comments
 (0)