Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
054cb73
copy /etc/hosts to /exports/hosts/hosts
bertiethorpe Oct 29, 2024
36de796
add resolv_conf role to compute script
bertiethorpe Oct 31, 2024
c1065b3
add manila to compute script
bertiethorpe Nov 6, 2024
fce13ed
Compute script: configure EESSI
bertiethorpe Nov 6, 2024
36f1e17
testing openhpc in compute script
bertiethorpe Nov 7, 2024
8930d38
finish transferring openhpc tasks to compute script
bertiethorpe Nov 12, 2024
4e4f206
move manila share info mount to compute_init role
bertiethorpe Nov 12, 2024
fda2d31
fix mounts
bertiethorpe Nov 13, 2024
998ebf1
address review comments
bertiethorpe Nov 13, 2024
feab4cf
Merge branch 'main' into feat/compute-script
bertiethorpe Nov 15, 2024
1e1779c
Merge branch 'main' into feat/compute-script
bertiethorpe Nov 19, 2024
903e22c
Merge branch 'main' into feat/compute-script
bertiethorpe Nov 20, 2024
a32e309
remove gres.conf - no-op
bertiethorpe Nov 20, 2024
a1f71b6
remove or hardcode some vars, make resolv_conf block conditional
bertiethorpe Nov 20, 2024
61392ed
move EESSI CVMFS install and config to nfs export
bertiethorpe Nov 20, 2024
51b02d3
move manila mount share to nfs export
bertiethorpe Nov 20, 2024
134515d
Pause CI testing for branch feat/compute-script
bertiethorpe Nov 20, 2024
f66feb9
simplify slurm-init file injection loop
bertiethorpe Nov 27, 2024
4a3074b
prototype script - hostvars no-op
bertiethorpe Dec 13, 2024
e3ce492
use k3s_server metadata for server_ip
bertiethorpe Dec 13, 2024
07ed822
compute init node condition based off metadata
bertiethorpe Dec 13, 2024
a43a5f9
fail gracefully when NFS server not up
bertiethorpe Dec 13, 2024
76f292e
rejoin node to cluster
bertiethorpe Dec 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ on:
- '!.gitignore'
- '!.github/workflows/'
- '.github/workflows/stackhpc'
branches:
- '!feat/compute-script'
jobs:
openstack:
name: openstack-ci
Expand Down
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ roles/*
!roles/squid/**
!roles/tuned/
!roles/tuned/**
!roles/compute_init/
!roles/compute_init/**
!roles/k3s/
!roles/k3s/**
!roles/k9s/
Expand Down
8 changes: 8 additions & 0 deletions ansible/extras.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@
- import_role:
name: persist_hostkeys

- name: Inject ansible-init compute script
  # Applies the compute_init role to every host in the compute_init group.
  # Can be selected on its own with `--tags compute_init`.
  hosts: compute_init
  become: yes
  tags: compute_init
  tasks:
    - ansible.builtin.import_role:
        name: compute_init

- name: Install k9s
become: yes
hosts: k9s
Expand Down
271 changes: 271 additions & 0 deletions ansible/roles/compute_init/files/compute-init.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
---

- name: Compute node initialisation
  hosts: localhost
  become: true
  # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA
  vars:
    # Address of the control node; reused below as the NFS server and as the
    # Slurm configless server.
    control_node_ip: '172.16.1.228'
    # The resolv.conf tasks are skipped when this list is empty or undefined.
    resolv_conf_nameservers:
      - '1.1.1.1'
      - '8.8.8.8'

    # Home directory NFS mount
    nfs_export: '/exports/home'
    nfs_client_mnt_options: null
    nfs_client_mnt_point: '/home'
    nfs_client_mnt_state: mounted
    nfs_server: "{{ control_node_ip }}"

    # Defaults applied to each Manila share unless the share overrides them
    os_manila_mount_state: mounted
    os_manila_mount_opts:
      - x-systemd.device-timeout=30
      - x-systemd.mount-timeout=30
      - noatime
      - _netdev  # prevents mount blocking early boot before networking available
      - rw
    os_manila_mount_ceph_conf_path: /etc/ceph

    # User accounts; home directories and SSH keys are not managed here
    basic_users_manage_homedir: false
    basic_users_userdefaults:
      state: present
      create_home: "{{ basic_users_manage_homedir }}"
      generate_ssh_key: "{{ basic_users_manage_homedir }}"
      ssh_key_comment: "{{ item.name }}"
    test_user_password: "zXpcWyGQL7jtZnqylQra4g=="
    basic_users_users:
      - name: testuser  # can't use rocky as $HOME isn't shared!
        # Salt is seeded from inventory_hostname so the hash is stable between runs
        password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
        uid: 1005
    basic_users_groups: []

    # slurmd fetches its configuration from this host (see --conf-server below)
    openhpc_conf_server: "{{ control_node_ip }}"

tasks:
- name: Configure resolv.conf
  # Writes a static /etc/resolv.conf and tells NetworkManager to leave it alone.
  # The whole block is skipped when no nameservers are configured.
  when: resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0
  block:
    - name: Set nameservers in /etc/resolv.conf
      ansible.builtin.template:
        src: /etc/ansible-init/templates/resolv.conf.j2
        dest: /etc/resolv.conf
        owner: root
        group: root
        mode: u=rw,og=r

    - name: Disable NetworkManager control of resolv.conf
      ansible.builtin.copy:
        src: /etc/ansible-init/files/NetworkManager-dns-none.conf
        dest: /etc/NetworkManager/conf.d/90-dns-none.conf
        owner: root
        group: root
        mode: u=rw,og=r
      register: _copy_nm_config

    # Only reload when the drop-in above was actually created or modified
    - name: Reload NetworkManager
      ansible.builtin.systemd:
        name: NetworkManager
        state: reloaded
      when: _copy_nm_config.changed | default(false)


- name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts
  # /mnt/cluster is the NFS export from the control node; later blocks read
  # further files from it (manila share info, cvmfs config, munge key).
  block:
    - name: Ensure the mount directory exists
      ansible.builtin.file:
        path: /mnt/cluster
        state: directory
        mode: "0755"

    - name: Mount /mnt/cluster
      ansible.posix.mount:
        path: /mnt/cluster
        # Reference the variable directly: the internal `vars` dict is not a
        # supported lookup path and yields untemplated values.
        src: "{{ control_node_ip }}:/exports/cluster"
        fstype: nfs
        opts: rw,sync
        state: mounted

    - name: Copy /mnt/cluster/hosts contents to /etc/hosts
      ansible.builtin.copy:
        src: /mnt/cluster/hosts
        dest: /etc/hosts
        owner: root
        group: root
        mode: "0644"


- name: NFS client mount
  # Mounts {{ nfs_server }}:{{ nfs_export }} at nfs_client_mnt_point.
  block:
    - name: ensure mount directory exists
      ansible.builtin.file:
        path: "{{ nfs_client_mnt_point }}"
        state: directory

    - name: mount the filesystem
      ansible.posix.mount:
        path: "{{ nfs_client_mnt_point }}"
        src: "{{ nfs_server }}:{{ nfs_export }}"
        fstype: nfs
        # Honour nfs_client_mnt_options when it is set; when it is null, empty
        # or undefined the parameter is omitted and module defaults apply.
        opts: "{{ nfs_client_mnt_options | default(omit, true) }}"
        state: "{{ nfs_client_mnt_state }}"


- name: Manila mount
  # Reads share definitions and share access info from files on /mnt/cluster,
  # writes the Ceph client configuration, then mounts each share.
  block:
    # slurp returns the file base64-encoded; it is decoded and parsed below.
    # Output is suppressed with no_log.
    - name: Read manila share info from nfs file
      ansible.builtin.slurp:
        src: "/mnt/cluster/manila_share_info.yml"
      register: manila_share_info_file
      no_log: true

    - name: Parse and set fact for manila share info
      ansible.builtin.set_fact:
        os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}"

    - name: Read manila shares from nfs file
      ansible.builtin.slurp:
        src: "/mnt/cluster/manila_shares.yml"
      register: manila_shares_file

    - name: Parse and set fact for manila shares
      ansible.builtin.set_fact:
        os_manila_mount_shares: "{{ manila_shares_file.content | b64decode | from_yaml }}"

    - name: Ensure Ceph configuration directory exists
      ansible.builtin.file:
        path: "{{ os_manila_mount_ceph_conf_path }}"
        state: directory
        mode: "0755"
        owner: root
        group: root

    - name: Configure ceph.conf using os_manila_mount_host
      ansible.builtin.template:
        src: /etc/ansible-init/templates/ceph.conf.j2
        dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf"
        owner: root
        group: root
        mode: "0600"

    - name: Ensure mount directory exists
      ansible.builtin.file:
        path: "{{ item.mount_path }}"
        state: directory
        owner: "{{ item.mount_user | default(omit) }}"
        group: "{{ item.mount_group | default(omit) }}"
        mode: "{{ item.mount_mode | default(omit) }}"
      loop: "{{ os_manila_mount_shares }}"
      loop_control:
        label: "{{ item.share_name }}"

    # One keyring file per entry in the share info list, named after share_user
    - name: Write Ceph client keyring
      ansible.builtin.template:
        src: /etc/ansible-init/templates/ceph.keyring.j2
        dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring"
        mode: "0600"
        owner: root
        group: root
      loop: "{{ os_manila_mount_share_info }}"
      loop_control:
        label: "{{ item.share_name }}"

    # The two lists are paired positionally: item[0] is the share definition,
    # item[1] is the matching share info entry.
    - name: Mount the Ceph share
      ansible.posix.mount:
        path: "{{ item[0].mount_path }}"
        src: "{{ item[1].host }}:{{ item[1].export }}"
        fstype: ceph
        opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}"
        # NB share_user is looked up here in case of autodetection
        state: "{{ item[0].mount_state | default(os_manila_mount_state) }}"
      loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}"
      loop_control:
        label: "{{ item[0].share_name }}"

    # Ownership/mode must be re-applied after mounting, but only for shares
    # that are actually mounted. The list needs a comma: `'mounted' or
    # 'ephemeral'` evaluates to just 'mounted', which silently excluded
    # ephemeral mounts.
    - name: Ensure mounted directory has correct permissions
      ansible.builtin.file:
        path: "{{ item.mount_path }}"
        state: directory
        owner: "{{ item.mount_user | default(omit) }}"
        group: "{{ item.mount_group | default(omit) }}"
        mode: "{{ item.mount_mode | default(omit) }}"
      loop: "{{ os_manila_mount_shares }}"
      loop_control:
        label: "{{ item.share_name }}"
      when: item.mount_state | default(os_manila_mount_state) in ['mounted', 'ephemeral']


- name: Basic users setup
  # Creates the groups and users listed in basic_users_groups and
  # basic_users_users, plus an optional sudoers drop-in per user.
  block:
    - name: Create groups
      ansible.builtin.group: "{{ item }}"
      loop: "{{ basic_users_groups }}"

    # Each user entry is layered over basic_users_userdefaults.
    # NOTE(review): filter_user_params is not a builtin filter - confirm it is
    # available to this playbook when run by ansible-init.
    - name: Create users
      ansible.builtin.user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}"
      loop: "{{ basic_users_users }}"
      loop_control:
        label: "{{ item.name }} [{{ item.state | default('present') }}]"
      register: basic_users_info

    # Only users that define a `sudo` key get a drop-in file.
    - name: Write sudo rules
      ansible.builtin.blockinfile:
        path: "/etc/sudoers.d/80-{{ item.name }}-user"
        block: "{{ item.sudo }}"
        create: true
        owner: root
        group: root
        mode: "0440"
        # Refuse to install a file that visudo cannot parse, so a bad rule
        # cannot break sudo on the node.
        validate: /usr/sbin/visudo -cf %s
      loop: "{{ basic_users_users }}"
      loop_control:
        label: "{{ item.name }}"
      when: "'sudo' in item"


- name: Configure EESSI
  # Installs the CVMFS client config from the cluster share and applies it.
  block:
    - name: Copy /mnt/cluster/cvmfs/default.local contents to /etc/cvmfs/default.local
      ansible.builtin.copy:
        dest: /etc/cvmfs/default.local
        src: /mnt/cluster/cvmfs/default.local
        mode: "0644"
        owner: root
        group: root

    # NOTE: Not clear how to make this idempotent
    # (the command runs, and reports changed, on every execution)
    - name: Ensure CVMFS config is setup
      ansible.builtin.command:
        cmd: cvmfs_config setup


- name: Configure openhpc
  # Installs the munge key from the cluster share, points slurmd at the
  # configless server, then enables and starts munge followed by slurmd.
  block:
    # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root)
    # which fails munged startup checks
    - name: Fix permissions on /etc to pass Munge startup checks
      ansible.builtin.file:
        path: /etc
        state: directory
        mode: g-w

    - name: Copy Munge key from NFS-mounted directory to /etc/munge
      ansible.builtin.copy:
        src: /mnt/cluster/openhpc_munge.key
        dest: /etc/munge/munge.key
        owner: munge
        group: munge
        mode: "0400"

    # Replaces any existing SLURMD_OPTIONS= line; the file is created if missing
    - name: Set slurmctld location for configless operation
      ansible.builtin.lineinfile:
        path: /etc/sysconfig/slurmd
        regexp: "^SLURMD_OPTIONS="
        line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'"
        create: true
        owner: root
        group: root
        mode: "0644"

    - name: Configure Munge service
      ansible.builtin.service:
        name: munge
        state: started
        enabled: true

    - name: Ensure slurmd state
      ansible.builtin.service:
        name: slurmd
        state: started
        enabled: true
Loading