
Commit 47bb230

CI tests for reboot via slurm (without rebuild)
1 parent fb324e6 commit 47bb230

File tree

.github/workflows/stackhpc.yml
ansible/adhoc/reboot_via_slurm.yml

2 files changed: +25 −1 lines changed

.github/workflows/stackhpc.yml

Lines changed: 2 additions & 1 deletion

@@ -178,12 +178,13 @@ jobs:
           ansible-playbook -v ansible/site.yml
           ansible-playbook -v ansible/ci/check_slurm.yml

-      - name: Test reimage of compute nodes and compute-init (via rebuild adhoc)
+      - name: Test compute node reimage, compute-init, and reboot
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
           ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
+          ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml

       - name: Check sacct state survived reimage
         run: |
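
For reference, the complete CI step after this change reads as below, reassembled from the diff hunk above (the surrounding job context and exact indentation are assumed):

      - name: Test compute node reimage, compute-init, and reboot
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
          ansible-playbook -v ansible/ci/check_slurm.yml
          ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml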

ansible/adhoc/reboot_via_slurm.yml

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+# Reboot compute nodes via slurm. The nodes will be rebuilt if image in hostvars is different to the active one in OpenStack.
+# Example:
+# ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
+
+- hosts: login
+  become: yes
+  gather_facts: no
+  tasks:
+    - name: Submit a Slurm job to reboot compute nodes
+      ansible.builtin.shell: |
+        set -e
+        srun --reboot -N 2 uptime
+      become_user: root
+      register: slurm_result
+      failed_when: slurm_result.rc != 0
+
+    - name: Fetch Slurm logs if reboot fails
+      ansible.builtin.shell: |
+        journalctl -u slurmctld --since "10 minutes ago" | tail -n 50
+      become_user: root
+      register: slurm_logs
+      when: slurm_result.rc != 0
+      delegate_to: "{{ groups['control'] | first }}"
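
As a rough manual equivalent of what this playbook drives (a sketch only, assuming a two-node compute partition and a standard Slurm setup), the same checks can be run by hand from a login node, with the log inspection done on the control host:

    # Ask Slurm to reboot the allocated nodes before running the job; srun blocks
    # until the nodes are back up and have executed `uptime`.
    srun --reboot -N 2 uptime

    # Confirm the nodes returned to service rather than staying down or drained.
    sinfo -N -l

    # On the control host: inspect recent slurmctld activity if the job failed.
    journalctl -u slurmctld --since "10 minutes ago" | tail -n 50

Note that srun --reboot is only honoured for privileged users and relies on the cluster's configured RebootProgram, which is presumably why the playbook runs the command as root.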

0 commit comments
