Skip to content

Commit 9ac9bac

Browse files
authored
Merge branch 'main' into ci/test-compute-init
2 parents 9af56d1 + fc38aab commit 9ac9bac

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+932
-367
lines changed

.github/workflows/nightlybuild.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ on:
1010
- LEAFCLOUD
1111
- SMS
1212
- ARCUS
13-
schedule:
14-
- cron: '0 0 * * *' # Run at midnight on default branch
13+
# schedule:
14+
# - cron: '0 0 * * *' # Run at midnight on default branch
1515

1616
jobs:
1717
openstack:

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@ It requires an OpenStack cloud, and an Ansible "deploy host" with access to that
3131

3232
Before starting ensure that:
3333
- You have root access on the deploy host.
34-
- You can create instances using a Rocky 9 GenericCloud image (or an image based on that).
35-
- **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI.
34+
- You can create instances from the [latest Slurm appliance image](https://github.com/stackhpc/ansible-slurm-appliance/releases), which already contains the required packages. This is built and tested in StackHPC's CI.
3635
- You have an SSH keypair defined in OpenStack, with the private key available on the deploy host.
3736
- Created instances have access to internet (note proxies can be setup through the appliance if necessary).
3837
- Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance).
@@ -50,6 +49,7 @@ These instructions assume the deployment host is running Rocky Linux 8:
5049
sudo yum install -y git python38
5150
git clone https://github.com/stackhpc/ansible-slurm-appliance
5251
cd ansible-slurm-appliance
52+
git checkout ${latest-release-tag}
5353
./dev/setup-env.sh
5454

5555
You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install/rpm/).

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,5 @@ roles/*
8686
!roles/rebuild/**
8787
!roles/slurm_tools/
8888
!roles/slurm_tools/**
89+
!roles/gateway/
90+
!roles/gateway/**

ansible/bootstrap.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,19 @@
5252
- import_role:
5353
name: proxy
5454

55+
- hosts: chrony
56+
tags: chrony
57+
become: yes
58+
tasks:
59+
- import_role:
60+
name: mrlesmithjr.chrony
61+
# skip install tasks as might not have network yet
62+
tasks_from: config_chrony.yml
63+
vars:
64+
# workaround for set_facts.yml:
65+
chrony_config: /etc/chrony.conf
66+
chrony_service: chronyd
67+
5568
- hosts: cluster
5669
gather_facts: false
5770
become: yes
@@ -306,10 +319,11 @@
306319
- include_role:
307320
name: azimuth_cloud.image_utils.linux_ansible_init
308321

309-
- hosts: k3s
322+
- hosts: k3s:&builder
310323
become: yes
311324
tags: k3s
312325
tasks:
313-
- ansible.builtin.include_role:
326+
- name: Install k3s
327+
ansible.builtin.include_role:
314328
name: k3s
315329
tasks_from: install.yml

ansible/cleanup.yml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,21 @@
3838

3939
- name: Cleanup /tmp
4040
command : rm -rf /tmp/*
41-
41+
42+
- name: Delete files triggering vulnerability scans
43+
ansible.builtin.file:
44+
path: "{{ item }}"
45+
state: absent
46+
loop: # NB: items here MUST have a justification!
47+
# ondemand install: raised at https://github.com/OSC/ondemand/security/advisories/GHSA-f7j8-ppqm-m5vw
48+
# All declared not to be an issue by Open Ondemand as relevant packages not installed
49+
- "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock"
50+
- "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock"
51+
- /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
52+
# chrony role: only used for role dev, venv never created on disk
53+
- /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/poetry.lock
54+
- /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/requirements.txt
55+
4256
- name: Get package facts
4357
package_facts:
4458

ansible/extras.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,23 @@
1+
- hosts: k3s_server:!builder
2+
become: yes
3+
tags: k3s
4+
tasks:
5+
- name: Start k3s server
6+
ansible.builtin.include_role:
7+
name: k3s
8+
tasks_from: server-runtime.yml
9+
10+
# technically should be part of bootstrap.yml but hangs waiting on failed mounts
11+
# if runs before filesystems.yml after the control node has been reimaged
12+
- hosts: k3s_agent:!builder
13+
become: yes
14+
tags: k3s
15+
tasks:
16+
- name: Start k3s agents
17+
ansible.builtin.include_role:
18+
name: k3s
19+
tasks_from: agent-runtime.yml
20+
121
- hosts: basic_users:!builder
222
become: yes
323
tags:

ansible/fatimage.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979
- import_playbook: extras.yml
8080

8181
# TODO: is this the right place?
82-
- name: Install compute_init script
82+
- name: Install compute_init playbook
8383
hosts: compute_init
8484
tags: compute_init # tagged to allow running on cluster instances for dev
8585
become: yes
@@ -88,6 +88,15 @@
8888
name: compute_init
8989
tasks_from: install.yml
9090

91+
- name: Install gateway playbook
92+
hosts: gateway
93+
tags: gateway
94+
become: yes
95+
gather_facts: no
96+
tasks:
97+
- include_role:
98+
name: gateway
99+
91100
- hosts: builder
92101
become: yes
93102
gather_facts: yes

ansible/roles/basic_users/README.md

Lines changed: 96 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,58 @@
22
basic_users
33
===========
44

5-
Setup users on cluster nodes using `/etc/passwd` and manipulating `$HOME`, i.e. without requiring LDAP etc. Features:
5+
Setup users on cluster nodes using `/etc/passwd` and manipulating `$HOME`, i.e.
6+
without requiring LDAP etc. Features:
67
- UID/GID is consistent across cluster (and explicitly defined).
7-
- SSH key generated and propagated to all nodes to allow login between cluster nodes.
8+
- SSH key generated and propagated to all nodes to allow login between cluster
9+
nodes.
810
- An "external" SSH key can be added to allow login from elsewhere.
9-
- Login to the control node is prevented.
11+
- Login to the control node is prevented (by default).
1012
- When deleting users, systemd user sessions are terminated first.
1113

12-
Requirements
13-
------------
14-
- $HOME (for normal users, i.e. not `centos`) is assumed to be on a shared filesystem.
14+
> [!IMPORTANT] The defaults for this role assume that `$HOME` for users
15+
managed by this role (e.g. not `rocky` and other system users) is on a shared
16+
filesystem. The export of this shared filesystem may be root squashed if its
17+
server is in the `basic_users` group - see configuration examples below.
1518

1619
Role Variables
1720
--------------
1821

19-
- `basic_users_users`: Optional, default empty list. A list of mappings defining information for each user. In general, mapping keys/values are passed through as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) and default values are as given there. However:
20-
- `create_home`, `generate_ssh_key` and `ssh_key_comment` are set automatically; this assumes home directories are on a cluster-shared filesystem.
21-
- `uid` should be set, so that the UID/GID is consistent across the cluster (which Slurm requires).
22-
- `shell` if *not* set will be `/sbin/nologin` on the `control` node and the default shell on other users. Explicitly setting this defines the shell for all nodes.
23-
- An additional key `public_key` may optionally be specified to define a key to log into the cluster.
24-
- An additional key `sudo` may optionally be specified giving a string (possibly multiline) defining sudo rules to be templated.
22+
- `basic_users_homedir_server`: Optional inventory hostname in the `basic_users`
23+
group defining the host to use to create home directories. If the home
24+
directory export is root squashed, this host *must* be the home directory
25+
server. Default is the `control` node which is appropriate for the default
26+
appliance configuration. Not relevant if `create_home` is false for all users.
27+
- `basic_users_homedir_server_path`: Optional path prefix for home directories on
28+
the `basic_users_homedir_server`, i.e. on the "server side". Default is
29+
`/exports/home` which is appropriate for the default appliance configuration.
30+
- `basic_users_homedir_client`: Optional inventory hostname in the `basic_users`
31+
group defining the host to use to create ssh keys etc in home directories.
32+
This should be a host mounting the home directories. Default is the first
33+
node in the `login` group which is appropriate for the default appliance
34+
configuration.
35+
- `basic_users_users`: Optional, default empty list. A list of mappings defining
36+
information for each user. In general, mapping keys/values are passed through
37+
as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html)
38+
and default values are as given there, with the following differences:
39+
- `generate_ssh_key`: Default is `true`, and the generated key is added to
40+
the user's authorized keys.
41+
- `ssh_key_comment`: Default is user name.
42+
- `home`: Set automatically based on the user name and
43+
`basic_users_homedir_server_path`. Can be overridden for users with
44+
non-standard home directory paths.
45+
- `uid`: Should be set, so that the UID/GID is consistent across the cluster
46+
(which Slurm requires).
47+
- `shell`: If *not* set will be `/sbin/nologin` on the `control` node to
48+
prevent users logging in to this node, and the default shell on other
49+
nodes. Explicitly setting this defines the shell for all nodes and if the
50+
shared home directories are mounted on the control node will allow the
51+
user to log in to the control node.
52+
- `public_key`: Optional, define a key to log into the cluster with.
53+
- `sudo`: Optional, a (possibly multiline) string defining sudo rules for the
54+
user.
55+
- `ssh_key_type` defaults to `ed25519` instead of the `ansible.builtin.user`
56+
default of `rsa`.
2557
- Any other keys may present for other purposes (i.e. not used by this role).
2658
- `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there.
2759
- `basic_users_override_sssd`: Optional bool, default false. Whether to disable `sssd` when ensuring users/groups exist with this role. Permits creating local users/groups even if they clash with users provided via sssd (e.g. from LDAP). Ignored if host is not in group `sssd` as well. Note with this option active `sssd` will be stopped and restarted each time this role is run.
@@ -31,29 +63,67 @@ Dependencies
3163

3264
None.
3365

34-
Example Playbook
35-
----------------
66+
Example Configurations
67+
----------------------
3668

37-
```yaml
38-
- hosts: basic_users
39-
become: yes
40-
gather_facts: yes
41-
tasks:
42-
- import_role:
43-
name: basic_users
44-
```
45-
46-
Example variables, to create user `alice` and delete user `bob`:
69+
With default appliance NFS configuration, create user `alice` with access
70+
to all nodes except the control node, and delete user `bob`:
4771

4872
```yaml
4973
basic_users_users:
5074
- comment: Alice Aardvark
5175
name: alice
5276
uid: 2005
53-
public_key: ssh-rsa ...
77+
public_key: ssh-ed25519 ...
5478
- comment: Bob Badger
5579
name: bob
5680
uid: 2006
57-
public_key: ssh-rsa ...
81+
public_key: ssh-ed25519 ...
5882
state: absent
5983
```
84+
85+
Using an external share which:
86+
- does not root squash (so this role can create directories on it)
87+
- is mounted to all nodes including the control node (so this role can set
88+
authorized keys there)
89+
90+
Create user `Carol`:
91+
92+
```yaml
93+
basic_users_homedir_host: "{{ ansible_play_hosts | first }}" # doesn't matter which host is used
94+
basic_users_homedir_host_path: /home # homedir_host is client not server
95+
basic_users_users:
96+
- comment: Carol Crane
97+
name: carol
98+
uid: 2007
99+
public_key: ssh-ed25519 ...
100+
```
101+
102+
Using an external share which *does* root squash, so home directories cannot be
103+
created by this role and must already exist, create user `Dan`:
104+
105+
```yaml
106+
basic_users_homedir_host: "{{ ansible_play_hosts | first }}"
107+
basic_users_homedir_host_path: /home
108+
basic_users_users:
109+
- comment: Dan Deer
110+
create_home: false
111+
name: dan
112+
uid: 2008
113+
public_key: ssh-ed25519 ...
114+
```
115+
116+
Using NFS exported from the control node, but mounted to all nodes (so that
117+
authorized keys applies to all nodes), create user `Erin` with passwordless sudo:
118+
119+
```yaml
120+
basic_users_users:
121+
- comment: Erin Eagle
122+
name: erin
123+
uid: 2009
124+
shell: /bin/bash # override default nologin on control
125+
groups:
126+
- adm # enables ssh to compute nodes even without a job running
127+
sudo: erin ALL=(ALL) NOPASSWD:ALL
128+
public_key: ssh-ed25519 ...
129+
```

ansible/roles/basic_users/defaults/main.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
basic_users_manage_homedir: "{{ (ansible_hostname == (ansible_play_hosts | first)) }}"
1+
basic_users_homedir_server: "{{ groups['control'] | first }}" # no way, generally, to find the nfs_server
2+
basic_users_homedir_server_path: /exports/home
3+
basic_users_homedir_client: "{{ groups['login'] | first }}"
24
basic_users_userdefaults:
3-
state: present
4-
create_home: "{{ basic_users_manage_homedir }}"
5-
generate_ssh_key: "{{ basic_users_manage_homedir }}"
5+
state: present # need this here so don't have to add default() everywhere
6+
generate_ssh_key: true
67
ssh_key_comment: "{{ item.name }}"
8+
ssh_key_type: ed25519
79
shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}"
810
basic_users_users: []
911
basic_users_groups: []

0 commit comments

Comments
 (0)