Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,13 @@ jobs:
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml

- name: Test reimage of compute nodes and compute-init (via rebuild adhoc)
- name: Test compute node reimage, compute-init, and reboot
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
ansible-playbook -v ansible/ci/check_slurm.yml
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml

- name: Check sacct state survived reimage
run: |
Expand Down
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,5 @@ roles/*
!roles/slurm_stats/**
!roles/pytools/
!roles/pytools/**
!roles/rebuild/
!roles/rebuild/**
23 changes: 23 additions & 0 deletions ansible/adhoc/reboot_via_slurm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Reboot compute nodes via Slurm. A node is rebuilt if the image in its hostvars differs from the active one in OpenStack.
# Example:
#   ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml

- hosts: login
  become: true
  gather_facts: false
  tasks:
    # block/rescue is used so the log collection actually runs when srun fails:
    # a plain failed task would drop the host from the play before any
    # follow-up task guarded by `when: slurm_result.rc != 0` could execute.
    - name: Reboot compute nodes via Slurm, collecting controller logs on failure
      block:
        - name: Submit a Slurm job to reboot compute nodes
          ansible.builtin.shell: |
            set -e
            srun --reboot -N 2 uptime
          become_user: root
          register: slurm_result

      rescue:
        - name: Fetch Slurm logs if reboot fails
          ansible.builtin.shell: |
            journalctl -u slurmctld --since "10 minutes ago" | tail -n 50
          become_user: root
          register: slurm_logs
          delegate_to: "{{ groups['control'] | first }}"

        # rescue would otherwise mark the play as recovered; re-fail so a
        # failed reboot still fails the playbook, and surface the logs.
        - name: Fail with the captured slurmctld logs
          ansible.builtin.fail:
            msg: |-
              Slurm reboot job failed (rc={{ slurm_result.rc | default('unknown') }}).
              Recent slurmctld logs:
              {{ slurm_logs.stdout | default('no logs captured') }}
16 changes: 11 additions & 5 deletions ansible/roles/compute_init/tasks/export.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
file:
path: /exports/cluster
state: directory
owner: root
owner: slurm
group: root
mode: u=rwX,go=
mode: u=rX,g=rwX,o=
run_once: true
delegate_to: "{{ groups['control'] | first }}"

Expand All @@ -23,21 +23,27 @@
file:
path: /exports/cluster/hostvars/{{ inventory_hostname }}/
state: directory
mode: u=rwX,go=
# TODO: owner,mode,etc
owner: slurm
group: root
mode: u=rX,g=rwX,o=
delegate_to: "{{ groups['control'] | first }}"

- name: Template out hostvars
template:
src: hostvars.yml.j2
dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml
mode: u=rw,go=
owner: slurm
group: root
mode: u=r,g=rw,o=
delegate_to: "{{ groups['control'] | first }}"

- name: Copy manila share info to /exports/cluster
copy:
content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}"
dest: /exports/cluster/manila_share_info.yml
owner: root
group: root
mode: u=rw,g=r
run_once: true
delegate_to: "{{ groups['control'] | first }}"
when: os_manila_mount_share_info is defined
Expand Down
2 changes: 2 additions & 0 deletions ansible/roles/rebuild/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# Source path of the clouds.yaml that this role's tasks copy to
# /etc/openstack/clouds.yaml. Quoted because the value starts with `~`.
openhpc_rebuild_clouds: "~/.config/openstack/clouds.yaml"
21 changes: 21 additions & 0 deletions ansible/roles/rebuild/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---

# Symbolic modes only change the classes they name, so `o=` is given
# explicitly below; without it the "other" bits keep whatever the
# umask/default produced (typically world-readable).

- name: Create /etc/openstack
  file:
    path: /etc/openstack
    state: directory
    owner: slurm
    group: root
    mode: "u=rX,g=rwX,o="

# NOTE(review): clouds.yaml normally carries OpenStack credentials, hence no
# access for "other" - confirm nothing else on the host needs to read it.
- name: Copy out clouds.yaml
  copy:
    src: "{{ openhpc_rebuild_clouds }}"
    dest: /etc/openstack/clouds.yaml
    owner: slurm
    group: root
    mode: "u=r,g=rw,o="

- name: Setup slurm tools
  include_role:
    name: slurm_tools
29 changes: 0 additions & 29 deletions ansible/roles/slurm_tools/.travis.yml

This file was deleted.

2 changes: 1 addition & 1 deletion ansible/roles/slurm_tools/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
module_defaults:
ansible.builtin.pip:
virtualenv: /opt/slurm-tools
virtualenv_command: python3 -m venv
virtualenv_command: "{{ 'python3.9 -m venv' if ansible_distribution_major_version == '8' else 'python3 -m venv' }}"
state: latest
become: true
become_user: "{{ pytools_user }}"
10 changes: 10 additions & 0 deletions ansible/slurm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
- include_role:
name: mysql

# Applies the `rebuild` role to every host in the `rebuild` group except
# those also in `builder`.
- name: Setup slurm-driven rebuild
  hosts: rebuild:!builder
  become: true
  tags:
    - rebuild
    - openhpc
  tasks:
    - name: Apply the rebuild role
      import_role:
        name: rebuild

- name: Setup slurm
hosts: openhpc
become: yes
Expand Down
7 changes: 3 additions & 4 deletions environments/.stackhpc/inventory/extra_groups
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
[basic_users:children]
cluster

[rebuild:children]
control
compute

[etc_hosts:children]
cluster

Expand Down Expand Up @@ -35,3 +31,6 @@ builder
[sssd:children]
# Install sssd into fat image
builder

[rebuild:children]
control
8 changes: 6 additions & 2 deletions environments/.stackhpc/tofu/SMS.tfvars
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
cluster_net = "stackhpc-ipv4-geneve"
cluster_subnet = "stackhpc-ipv4-geneve-subnet"
cluster_networks = [
{
network = "stackhpc-ipv4-geneve"
subnet = "stackhpc-ipv4-geneve-subnet"
}
]
control_node_flavor = "general.v1.small"
other_node_flavor = "general.v1.small"
2 changes: 1 addition & 1 deletion environments/.stackhpc/tofu/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ module "cluster" {
nodes: ["compute-0", "compute-1"]
flavor: var.other_node_flavor
compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"]
# ignore_image_changes: true
ignore_image_changes: true
}
# Example of how to add another partition:
# extra: {
Expand Down
8 changes: 7 additions & 1 deletion environments/common/inventory/group_vars/all/openhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,14 @@ openhpc_config_default:
SlurmctldParameters:
- enable_configless
TaskPlugin: task/cgroup,task/affinity
ReturnToService: 2
openhpc_config_rebuild:
RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild
SlurmctldParameters:
- reboot_from_controller
ResumeTimeout: 300
openhpc_config_extra: {}
openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}"
openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_rebuild if groups['rebuild'] | length > 0 else {}, openhpc_config_extra, list_merge='append') }}"
openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"

openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhpc/ansible-slurm-appliance/pull/326
Expand Down
6 changes: 3 additions & 3 deletions environments/common/inventory/groups
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ mysql
# All hosts requiring control of SELinux status.
cluster

[rebuild]
# Enable rebuild of nodes on an OpenStack cloud; add 'control' group plus 'compute' group or a subset of it.

[update]
# All hosts to (optionally) run yum update on.

Expand Down Expand Up @@ -165,3 +162,6 @@ extra_packages

[pulp]
# Add builder to this group to enable automatically syncing of pulp during image build

[rebuild]
# Enable rebuild of nodes on an OpenStack cloud; add 'control' group.
4 changes: 2 additions & 2 deletions environments/common/layouts/everything
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ control
[filebeat:children]
slurm_stats

# NB: [rebuild] not defined here as this template is used in CI

[update:children]

[fail2ban:children]
Expand Down Expand Up @@ -111,3 +109,5 @@ control
[extra_packages:children]
# Hosts to install specified additional packages on
builder

# NB: [rebuild] is not defined here as it is likely to need features not currently supported
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,11 @@ ${cluster_name}_${group_name}:
${ node.name }:
ansible_host: ${node.access_ip_v4}
instance_id: ${ node.id }
image_id: ${ node.image_id }
networks: ${jsonencode({for n in node.network: n.name => {"fixed_ip_v4": n.fixed_ip_v4, "fixed_ip_v6": n.fixed_ip_v6}})}
%{ endfor ~}
vars:
# NB: this is the target image, not necessarily what is provisioned
image_id: ${compute_groups[group_name]["image_id"]}
%{ endfor ~}

compute:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,7 @@ resource "openstack_compute_instance_v2" "compute" {
output "compute_instances" {
value = local.compute_instances
}

output "image_id" {
value = var.image_id
}