
Commit 8c98e16

Commit message: merge conflicts

2 parents 533d7c5 + a769015

File tree

32 files changed: +827, -51 lines


.github/workflows/doca.yml renamed to .github/workflows/extra.yml

Lines changed: 13 additions & 8 deletions
@@ -1,4 +1,4 @@
-name: Test DOCA extra build
+name: Test extra build
 on:
   workflow_dispatch:
   push:
@@ -7,16 +7,18 @@ on:
     paths:
       - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
       - 'ansible/roles/doca/**'
-      - '.github/workflows/doca'
+      - 'ansible/roles/cuda/**'
+      - '.github/workflows/extra.yml'
   pull_request:
     paths:
       - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
       - 'ansible/roles/doca/**'
-      - '.github/workflows/doca'
+      - 'ansible/roles/cuda/**'
+      - '.github/workflows/extra.yml'
 
 jobs:
   doca:
-    name: doca-build
+    name: extra-build
     concurrency:
       group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
       cancel-in-progress: true
@@ -25,12 +27,14 @@ jobs:
       fail-fast: false # allow other matrix jobs to continue even if one fails
       matrix: # build RL8, RL9
         build:
-          - image_name: openhpc-doca-RL8
+          - image_name: openhpc-extra-RL8
             source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
-            inventory_groups: doca
-          - image_name: openhpc-doca-RL9
+            inventory_groups: doca,cuda
+            volume_size: 30 # needed for cuda
+          - image_name: openhpc-extra-RL9
             source_image_name_key: RL9
-            inventory_groups: doca
+            inventory_groups: doca,cuda
+            volume_size: 30 # needed for cuda
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
@@ -95,6 +99,7 @@ jobs:
             -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
             -var "image_name=${{ matrix.build.image_name }}" \
             -var "inventory_groups=${{ matrix.build.inventory_groups }}" \
+            -var "volume_size=${{ matrix.build.volume_size }}" \
             openstack.pkr.hcl
 
       - name: Get created image names from manifest

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,8 @@ roles/*
 !roles/squid/**
 !roles/tuned/
 !roles/tuned/**
+!roles/compute_init/
+!roles/compute_init/**
 !roles/k3s/
 !roles/k3s/**
 !roles/k9s/

ansible/cleanup.yml

Lines changed: 1 addition & 2 deletions
@@ -66,5 +66,4 @@
       slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"
 
 - name: Show image summary
-  debug:
-    var: image_info
+  command: cat /var/lib/image/image.json

ansible/extras.yml

Lines changed: 24 additions & 1 deletion
@@ -24,8 +24,9 @@
   gather_facts: yes
   tags: cuda
   tasks:
-    - import_role:
+    - include_role:
         name: cuda
+        tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
 
 - name: Persist hostkeys across rebuilds
   # Must be after filesystems.yml (for storage)
@@ -37,10 +38,32 @@
     - import_role:
         name: persist_hostkeys
 
+
+- name: Setup NFS export for compute node configuration
+  hosts: compute_init:!builder
+  # NB: has to be after eeesi and os-manila-mount
+  tags: compute_init
+  become: yes
+  name: Export hostvars
+  tasks:
+    - include_role:
+        name: compute_init
+        tasks_from: export.yml
+
 - name: Install k9s
   become: yes
   hosts: k9s
   tags: k9s
   tasks:
     - import_role:
         name: k9s
+
+- hosts: extra_packages
+  become: yes
+  tags:
+    - extra_packages
+  tasks:
+    - name: Install additional packages
+      dnf:
+        name: "{{ appliances_extra_packages }}"
+      when: appliances_mode != 'configure' or appliances_extra_packages_during_configure
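
For context, the new `extra_packages` play above reads two inventory variables. A hypothetical `group_vars` entry might look like the following; the package names and flag value are illustrative examples, not part of this commit:

```yaml
# Illustrative group_vars for the extra_packages play - example values only.
appliances_extra_packages:
  - htop
  - tmux
appliances_extra_packages_during_configure: false  # skip dnf installs when appliances_mode == 'configure'
```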

ansible/fatimage.yml

Lines changed: 20 additions & 2 deletions
@@ -29,6 +29,14 @@
 
 - import_playbook: bootstrap.yml
 
+- hosts: doca
+  become: yes
+  gather_facts: yes
+  tasks:
+    - name: Install NVIDIA DOCA
+      import_role:
+        name: doca
+
 - name: Run post-bootstrap.yml hook
   vars:
     appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
@@ -65,6 +73,16 @@
 
 - import_playbook: extras.yml
 
+# TODO: is this the right place?
+- name: Install compute_init script
+  hosts: compute_init
+  tags: compute_init # tagged to allow running on cluster instances for dev
+  become: yes
+  tasks:
+    - include_role:
+        name: compute_init
+        tasks_from: install.yml
+
 - hosts: builder
   become: yes
   gather_facts: yes
@@ -220,15 +238,15 @@
       import_role:
         name: doca
 
-- import_playbook: disable-repos.yml
-
 - name: Run post.yml hook
   vars:
     appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
     hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
   import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
   when: hook_path | exists
 
+- import_playbook: disable-repos.yml
+
 - hosts: builder
   become: yes
   gather_facts: yes

ansible/filter_plugins/utils.py

Lines changed: 3 additions & 1 deletion
@@ -49,7 +49,9 @@ def to_ood_regex(items):
     return '|'.join(r)
 
 def appliances_repo_to_subpath(repo_entry):
-    return repo_entry['path']+'/'+repo_entry['timestamp']
+    """ Take an element from appliances_pulp_repos and convert it to a pulp path. This assumes that the remote and local pulp structures are the same
+    """
+    return repo_entry['path'] + '/' + repo_entry['timestamp']
 
 class FilterModule(object):
     ''' Ansible core jinja2 filters '''
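
As a usage illustration only, and assuming the `FilterModule` registers the function under the same name, the filter joins an `appliances_pulp_repos` entry's path and timestamp. The repo values below are made up:

```yaml
# Illustrative use of the filter from an Ansible task; repo values are examples only.
- name: Show the pulp subpath for one repo entry
  vars:
    repo_entry:
      path: rocky/9/baseos/x86_64/os   # example only
      timestamp: 20240101T000000       # example only
  ansible.builtin.debug:
    msg: "{{ repo_entry | appliances_repo_to_subpath }}"  # -> rocky/9/baseos/x86_64/os/20240101T000000
```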

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 2 additions & 2 deletions
@@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" {
       ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
 {% endif %}
 {% endfor %}
-    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
+    control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
     k3s_token = "{{ k3s_token }}"
   }
 }
@@ -565,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
       ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
 {% endif %}
 {% endfor %}
-    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
+    control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
     k3s_token = "{{ k3s_token }}"
   }
 }
Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
# EXPERIMENTAL: compute-init

Experimental / in-progress functionality to allow compute nodes to rejoin the
cluster after a reboot.

To enable this, add compute nodes (or a subset of them) into the `compute_init`
group.

This works as follows:
1. During image build, an ansible-init playbook and supporting files
   (e.g. templates, filters, etc.) are installed.
2. Cluster instances are created as usual; the above compute-init playbook does
   not run.
3. The `site.yml` playbook is run as usual to configure all the instances into
   a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS
   share is created on the control node containing:
   - an `/etc/hosts` file for the cluster
   - hostvars for each compute node
4. On reboot of a compute node, ansible-init runs the compute-init playbook,
   which:
   a. Checks whether the `enable_compute` metadata flag is set, and exits if
      not.
   b. Tries to mount the above `/exports/cluster` NFS share from the control
      node, and exits if it cannot.
   c. Configures itself using the exported hostvars, depending on the
      `enable_*` flags set in metadata.
   d. Issues an `scontrol` command to resume the node (because Slurm will
      consider it as "unexpectedly rebooted").

The check in 4b. above is what prevents the compute-init script from trying
to configure the node before the services on the control node are available
(which requires running the site.yml playbook).

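As a rough sketch of the early-exit checks in 4a/4b (illustrative only, not the actual playbook: the way metadata is read, the mount point and the `control_address` variable are assumptions):

```yaml
# Minimal sketch of the 4a/4b checks - illustrative assumptions throughout.
- name: Exit if compute-init is not enabled for this node
  ansible.builtin.meta: end_play
  when: not (os_metadata.meta.enable_compute | default(false) | bool)  # os_metadata: assumed fact

- name: Try to mount the cluster share exported by the control node
  ansible.posix.mount:
    path: /mnt/cluster
    src: "{{ control_address }}:/exports/cluster"  # control_address: assumed to come from instance metadata
    fstype: nfs
    state: mounted
  register: _cluster_mount
  ignore_errors: true  # control node may not be configured yet

- name: Exit if the export is not available yet (i.e. site.yml has not run)
  ansible.builtin.meta: end_play
  when: _cluster_mount is failed
```

Failing fast at this point is what gives the "safe to reboot before site.yml has run" behaviour described above.
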
The following roles/groups are currently fully functional:
- `resolv_conf`: all functionality
- `etc_hosts`: all functionality
- `nfs`: client functionality only
- `manila`: all functionality
- `basic_users`: all functionality, assumes home directory already exists on
  shared storage
- `eessi`: all functionality, assumes `cvmfs_config` is the same on the control
  node and all compute nodes.
- `openhpc`: all functionality

# Development/debugging

To develop/debug this without actually having to build an image:

1. Deploy a cluster using tofu and ansible/site.yml as normal. This will
   additionally configure the control node to export compute hostvars over NFS.
   Check the cluster is up.

2. Reimage the compute nodes:

       ansible-playbook --limit compute ansible/adhoc/rebuild.yml

3. Add metadata to a compute node, e.g. via Horizon, to turn on compute-init
   playbook functionality.

4. Fake an image build to deploy the compute-init playbook:

       ansible-playbook ansible/fatimage.yml --tags compute_init

   NB: This will also re-export the compute hostvars, as the nodes are not
   in the builder group, which conveniently means any changes made to that
   play also get picked up.

5. Fake a reimage of compute to run ansible-init and the compute-init playbook.
   On the compute node where metadata was added:

       [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init
       [root@rl9-compute-0 rocky]# systemctl status ansible-init

   Use `systemctl status ansible-init` to view stdout/stderr from Ansible.

Steps 4/5 can be repeated with changes to the compute script. If required,
reimage the compute node(s) first as in step 2 and/or add additional metadata
as in step 3.

# Design notes
- Duplicating code from roles into the `compute-init` script is unfortunate, but
  does allow developing this functionality without wider changes to the
  appliance.

- In general, we don't want to rely on the NFS export, so the compute-init
  script should e.g. copy files from this mount ASAP. TODO:

- There are a couple of approaches to supporting existing roles using `compute-init`:

  1. The control node copies files resulting from the role into the cluster
     exports, and compute-init copies them to local disk. Only works if files
     are not host-specific. Examples: etc_hosts, eessi config?

  2. Re-implement the role. Works if the role vars are not too complicated
     (else they all need to be duplicated in compute-init). Could also only
     support certain subsets of role functionality or variables.
     Examples: resolv_conf, stackhpc.openhpc

- Some variables are defined using hostvars from other nodes, which aren't
  available with the current approach:

  ```
  [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
      "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}",
      "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}",
      "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}",
      "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}",
      "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}",
      "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}",
      "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}",
      "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}",
      "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}",
      "{{ hostvars[groups['freeipa_server'].0].ansible_host }}"
  ```

  More generally, there is nothing to stop any group var depending on a
  "{{ hostvars[] }}" interpolation ...

  Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern
  for compute nodes - both of these indirect via `api_address` to
  `inventory_hostname`. This has been worked around by replacing this with
  "{{ groups['control'] | first }}", which does result in the control node's
  inventory hostname when templating.

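  To make that workaround concrete, this is roughly the change for one of the affected variables (illustrative; the real change is made where the hostvars are templated for export, not necessarily in this exact form):

  ```yaml
  # Before - indirects via api_address, which cannot be resolved correctly when
  # the exported hostvars are re-templated on the compute node:
  #   openhpc_slurm_control_host: "{{ hostvars[groups['control'].0].api_address }}"
  #
  # After - templates directly to the control node's inventory hostname:
  openhpc_slurm_control_host: "{{ groups['control'] | first }}"
  ```
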
  Note that although `groups` is defined in the templated hostvars, when
  the hostvars are loaded using `include_vars:` it is ignored as it is a
  "magic variable" determined by ansible itself and cannot be set.
