File tree Expand file tree Collapse file tree 2 files changed +25
-1
lines changed
Expand file tree Collapse file tree 2 files changed +25
-1
lines changed Original file line number Diff line number Diff line change @@ -178,12 +178,13 @@ jobs:
178178 ansible-playbook -v ansible/site.yml
179179 ansible-playbook -v ansible/ci/check_slurm.yml
180180
181- - name : Test reimage of compute nodes and compute-init (via rebuild adhoc)
181+ - name : Test compute node reimage, compute-init, and reboot
182182 run : |
183183 . venv/bin/activate
184184 . environments/.stackhpc/activate
185185 ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
186186 ansible-playbook -v ansible/ci/check_slurm.yml
187+ ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
187188
188189 - name : Check sacct state survived reimage
189190 run : |
Original file line number Diff line number Diff line change 1+ # Reboot compute nodes via Slurm. The nodes will be rebuilt if the image in hostvars differs from the active one in OpenStack.
2+ # Example:
3+ # ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
4+
# Play: from a login node, submit a trivial Slurm job with `srun --reboot`
# so that the compute nodes are rebooted, and surface the slurmctld journal
# if that job fails.
#
# The reboot task and the log collection are wrapped in block/rescue: a
# failed task stops the play for that host, so a plain follow-up task guarded
# by `when: slurm_result.rc != 0` would never run.
- hosts: login
  become: true
  gather_facts: false
  tasks:
    - name: Reboot compute nodes via Slurm, collecting controller logs on failure
      block:
        - name: Submit a Slurm job to reboot compute nodes
          ansible.builtin.shell: |
            set -e
            srun --reboot -N 2 uptime
          become_user: root
          register: slurm_result
          failed_when: slurm_result.rc != 0

      rescue:
        # Runs only when a task in the block above failed.
        - name: Fetch Slurm logs if reboot fails
          ansible.builtin.shell: |
            journalctl -u slurmctld --since "10 minutes ago" | tail -n 50
          become_user: root
          register: slurm_logs
          # Read-only diagnostic; never report it as a change.
          changed_when: false
          # slurmctld logs are read on the first host of the `control` group.
          delegate_to: "{{ groups['control'] | first }}"

        - name: Show Slurm controller logs
          ansible.builtin.debug:
            var: slurm_logs.stdout_lines

        # rescue would otherwise mark the host as recovered; re-fail so that
        # callers (e.g. CI) still see the reboot failure.
        - name: Fail because the Slurm reboot job failed
          ansible.builtin.fail:
            msg: >-
              srun --reboot failed
              (rc={{ slurm_result.rc | default('unknown') }});
              see slurmctld logs above.
You can’t perform that action at this time.
0 commit comments