
Commit c392db0

Merge branch 'feat/slurm-rebuild' into docs/compute-init-roles-v2

2 parents: 050af5e + 2a0f3b7

25 files changed: +306 -112 lines

.github/workflows/stackhpc.yml

Lines changed: 2 additions & 1 deletion

```diff
@@ -178,12 +178,13 @@ jobs:
           ansible-playbook -v ansible/site.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
 
-      - name: Test reimage of compute nodes and compute-init (via rebuild adhoc)
+      - name: Test compute node reimage, compute-init, and reboot
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
           ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
+          ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
 
       - name: Check sacct state survived reimage
         run: |
```

ansible/adhoc/reboot_via_slurm.yml

Lines changed: 23 additions & 0 deletions

```diff
@@ -0,0 +1,23 @@
+# Reboot compute nodes via slurm. The nodes will be rebuilt if image in hostvars is different to the active one in OpenStack.
+# Example:
+#   ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
+
+- hosts: login
+  become: yes
+  gather_facts: no
+  tasks:
+    - name: Submit a Slurm job to reboot compute nodes
+      ansible.builtin.shell: |
+        set -e
+        srun --reboot -N 2 uptime
+      become_user: root
+      register: slurm_result
+      failed_when: slurm_result.rc != 0
+
+    - name: Fetch Slurm logs if reboot fails
+      ansible.builtin.shell: |
+        journalctl -u slurmctld --since "10 minutes ago" | tail -n 50
+      become_user: root
+      register: slurm_logs
+      when: slurm_result.rc != 0
+      delegate_to: "{{ groups['control'] | first }}"
```
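For orientation, the effect of this playbook can be verified from the cluster itself. The following is a minimal sketch (the expectation that nodes return to "idle" assumes an otherwise healthy cluster):

```shell
# Trigger the slurm-driven reboot; nodes whose image has changed are rebuilt:
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml

# On a login node, confirm the compute nodes re-registered with slurmctld;
# a state of "idle" indicates the reboot/rebuild completed:
sinfo --Node --long
```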

ansible/roles/compute_init/tasks/export.yml

Lines changed: 11 additions & 5 deletions

```diff
@@ -2,9 +2,9 @@
   file:
     path: /exports/cluster
     state: directory
-    owner: root
+    owner: slurm
     group: root
-    mode: u=rwX,go=
+    mode: u=rX,g=rwX,o=
   run_once: true
   delegate_to: "{{ groups['control'] | first }}"
 
@@ -23,21 +23,27 @@
   file:
     path: /exports/cluster/hostvars/{{ inventory_hostname }}/
     state: directory
-    mode: u=rwX,go=
-    # TODO: owner,mode,etc
+    owner: slurm
+    group: root
+    mode: u=rX,g=rwX,o=
   delegate_to: "{{ groups['control'] | first }}"
 
 - name: Template out hostvars
   template:
     src: hostvars.yml.j2
     dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml
-    mode: u=rw,go=
+    owner: slurm
+    group: root
+    mode: u=r,g=rw,o=
   delegate_to: "{{ groups['control'] | first }}"
 
 - name: Copy manila share info to /exports/cluster
   copy:
     content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}"
     dest: /exports/cluster/manila_share_info.yml
+    owner: root
+    group: root
+    mode: u=rw,g=r
   run_once: true
   delegate_to: "{{ groups['control'] | first }}"
   when: os_manila_mount_share_info is defined
```

ansible/roles/k3s/files/start_k3s.yml

Lines changed: 8 additions & 0 deletions

```diff
@@ -5,6 +5,7 @@
     k3s_token: "{{ os_metadata.meta.k3s_token }}"
     k3s_server_name: "{{ os_metadata.meta.control_address }}"
     service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}"
+    access_ip: "{{ os_metadata.meta.access_ip }}"
   tasks:
     - name: Ensure password directory exists
       ansible.builtin.file:
@@ -22,6 +23,13 @@
         path: "/etc/systemd/system/{{ service_name }}.service.env"
         line: "K3S_TOKEN={{ k3s_token }}"
 
+    - name: Add the node IP to the environment
+      # NB this isn't natively setable via envvars, have to modify
+      # INSTALL_K3S_EXEC to support it
+      ansible.builtin.lineinfile:
+        path: "/etc/systemd/system/{{ service_name }}.service.env"
+        line: "K3S_NODE_IP={{ access_ip }}"
+
     - name: Add server url to agents
       ansible.builtin.lineinfile:
         path: "/etc/systemd/system/{{ service_name }}.service.env"
```

ansible/roles/k3s/tasks/install.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -47,7 +47,7 @@
       cmd: /usr/bin/k3s-install.sh
     environment:
       INSTALL_K3S_VERSION: "{{ k3s_version }}"
-      INSTALL_K3S_EXEC: "{{ item }}"
+      INSTALL_K3S_EXEC: "{{ item }} --node-ip=${K3S_NODE_IP}"
       INSTALL_K3S_SKIP_START: "true"
       INSTALL_K3S_SKIP_ENABLE: "true"
       INSTALL_K3S_BIN_DIR: "/usr/bin"
```
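Taken together, these two k3s changes write the node IP into the systemd environment file and pass it through to the install script via INSTALL_K3S_EXEC. After start_k3s.yml runs, the environment file would contain entries along these lines (a sketch; the token and IP values are illustrative assumptions):

```shell
# /etc/systemd/system/k3s.service.env (illustrative contents)
K3S_TOKEN=<token from instance metadata>
K3S_NODE_IP=10.0.0.11
```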

ansible/roles/rebuild/tasks/main.yml

Lines changed: 4 additions & 4 deletions

```diff
@@ -4,17 +4,17 @@
   file:
     path: /etc/openstack
     state: directory
-    owner: root
+    owner: slurm
     group: root
-    mode: '0400'
+    mode: u=rX,g=rwX
 
 - name: Copy out clouds.yaml
   copy:
     src: "{{ openhpc_rebuild_clouds }}"
     dest: /etc/openstack/clouds.yaml
-    owner: root
+    owner: slurm
     group: root
-    mode: '0400'
+    mode: u=r,g=rw
 
 - name: Setup slurm tools
   include_role:
```

ansible/roles/slurm_tools/tasks/main.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -27,7 +27,7 @@
   module_defaults:
     ansible.builtin.pip:
       virtualenv: /opt/slurm-tools
-      virtualenv_command: python3 -m venv
+      virtualenv_command: "{{ 'python3.9 -m venv' if ansible_distribution_major_version == '8' else 'python3 -m venv' }}"
       state: latest
   become: true
   become_user: "{{ pytools_user }}"
```

ansible/slurm.yml

Lines changed: 2 additions & 1 deletion

```diff
@@ -10,10 +10,11 @@
       name: mysql
 
 - name: Setup slurm-driven rebuild
-  hosts: rebuild
+  hosts: rebuild:!builder
   become: yes
   tags:
     - rebuild
+    - openhpc
   tasks:
     - import_role:
         name: rebuild
```

docs/networks.md

Lines changed: 102 additions & 0 deletions

````diff
@@ -0,0 +1,102 @@
+# Networking
+
+The default OpenTofu configurations in the appliance do not provision networks,
+subnets or associated infrastructure such as routers. The requirements are that:
+1. At least one network exists.
+2. The first network defined spans all nodes, and is referred to as the "access network".
+3. Only one subnet per network is attached to nodes.
+4. At least one network on each node provides outbound internet access (either
+directly, or via a proxy).
+
+Furthermore, it is recommended that the deploy host has an interface on the
+access network. While it is possible to e.g. use a floating IP on a login node
+as an SSH proxy to access the other nodes, this can create problems in recovering
+the cluster if the login node is unavailable, and can make Ansible problems harder
+to debug.
+
+This page describes supported configurations and how to implement them using
+the OpenTofu variables. These will normally be set in
+`environments/site/tofu/terraform.tfvars` for the site base environment. If they
+need to be overridden for specific environments, this can be done via an OpenTofu
+module as discussed [here](./production.md).
+
+Note that if an OpenStack subnet has a gateway IP defined, then nodes with ports
+attached to that subnet will get a default route set via that gateway.
+
+## Single network
+This is the simplest possible configuration. A single network and subnet is
+used for all nodes. The subnet provides outbound internet access via the default
+route defined by the subnet gateway (often an OpenStack router to an external
+network).
+
+```terraform
+cluster_networks = [
+  {
+    network = "netA"
+    subnet = "subnetA"
+  }
+]
+...
+```
+
+## Multiple homogeneous networks
+This is similar to the above, except each node has multiple networks. The first
+network, "netA", is the access network. Note that only one of the subnets may
+have a gateway defined, otherwise default routes via both subnets will be
+present, causing routing problems. This example also shows the second network
+(netB) using direct-type vNICs for RDMA.
+
+```terraform
+cluster_networks = [
+  {
+    network = "netA"
+    subnet = "subnetA"
+  },
+  {
+    network = "netB"
+    subnet = "subnetB"
+  },
+]
+
+vnic_types = {
+  netB = "direct"
+}
+...
+```
+
+## Additional networks on some nodes
+
+This example shows how to modify variables for specific node groups. In this
+case a baremetal node group has a second network attached. As above, only a
+single subnet can have a gateway IP.
+
+```terraform
+cluster_networks = [
+  {
+    network = "netA"
+    subnet = "subnetA"
+  }
+]
+
+compute = {
+  general = {
+    nodes = ["general-0", "general-1"]
+  }
+  baremetal = {
+    nodes = ["baremetal-0", "baremetal-1"]
+    extra_networks = [
+      {
+        network = "netB"
+        subnet = "subnetB"
+      }
+    ]
+    vnic_types = {
+      netA = "baremetal"
+      netB = "baremetal"
+      ...
+    }
+  }
+}
+...
+```
````
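The new docs page notes that only one subnet should define a gateway IP. For reference, the gateway on each subnet can be inspected and adjusted with the OpenStack CLI; a sketch, with the subnet names as assumptions:

```shell
# Check which subnets define a gateway (and hence inject a default route):
openstack subnet show subnetA -f value -c gateway_ip
openstack subnet show subnetB -f value -c gateway_ip

# Clear the gateway on the non-access subnet so nodes get a single
# default route via the access network:
openstack subnet set --gateway none subnetB
```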

environments/.stackhpc/tofu/LEAFCLOUD.tfvars

Lines changed: 6 additions & 2 deletions

```diff
@@ -1,5 +1,9 @@
-cluster_net = "slurmapp-ci"
-cluster_subnet = "slurmapp-ci"
+cluster_networks = [
+  {
+    network = "slurmapp-ci"
+    subnet = "slurmapp-ci"
+  }
+]
 control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment
 other_node_flavor = "en1.xsmall"
 state_volume_type = "unencrypted"
```
