Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
054cb73
copy /etc/hosts to /exports/hosts/hosts
bertiethorpe Oct 29, 2024
36de796
add resolv_conf role to compute script
bertiethorpe Oct 31, 2024
c1065b3
add manila to compute script
bertiethorpe Nov 6, 2024
fce13ed
Compute script: configure EESSI
bertiethorpe Nov 6, 2024
36f1e17
testing openhpc in compute script
bertiethorpe Nov 7, 2024
8930d38
finish transferring openhpc tasks to compute script
bertiethorpe Nov 12, 2024
4e4f206
move manila share info mount to compute_init role
bertiethorpe Nov 12, 2024
fda2d31
fix mounts
bertiethorpe Nov 13, 2024
998ebf1
address review comments
bertiethorpe Nov 13, 2024
feab4cf
Merge branch 'main' into feat/compute-script
bertiethorpe Nov 15, 2024
1e1779c
Merge branch 'main' into feat/compute-script
bertiethorpe Nov 19, 2024
903e22c
Merge branch 'main' into feat/compute-script
bertiethorpe Nov 20, 2024
a32e309
remove gres.conf - no-op
bertiethorpe Nov 20, 2024
a1f71b6
remove or hardcode some vars, make resolv_conf block conditional
bertiethorpe Nov 20, 2024
61392ed
move EESSI CVMFS install and config to nfs export
bertiethorpe Nov 20, 2024
51b02d3
move manila mount share to nfs export
bertiethorpe Nov 20, 2024
134515d
Pause CI testing for branch feat/compute-script
bertiethorpe Nov 20, 2024
f66feb9
simplify slurm-init file injection loop
bertiethorpe Nov 27, 2024
4a3074b
prototype script - hostvars no-op
bertiethorpe Dec 13, 2024
e3ce492
use k3s_server metadata for server_ip
bertiethorpe Dec 13, 2024
07ed822
compute init node condition based off metadata
bertiethorpe Dec 13, 2024
a43a5f9
fail gracefully when NFS server not up
bertiethorpe Dec 13, 2024
76f292e
rejoin node to cluster
bertiethorpe Dec 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,6 @@ roles/*
!roles/squid/**
!roles/tuned/
!roles/tuned/**
!roles/compute_init/
!roles/compute_init/**

8 changes: 8 additions & 0 deletions ansible/extras.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,11 @@
tasks:
- import_role:
name: persist_hostkeys

# Statically imports the compute_init role on every host in the
# `compute_init` group, with privilege escalation. Selectable on its own via
# the `compute_init` tag.
- name: Inject ansible-init compute script
  hosts: compute_init
  tags: compute_init
  become: true
  tasks:
    - import_role:
        name: compute_init
19 changes: 19 additions & 0 deletions ansible/filesystems.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,22 @@
tasks:
- include_role:
name: stackhpc.os-manila-mount

# On the control host: creates /exports/cluster and writes the
# os_manila_mount_share_info variable there as YAML
# (/exports/cluster/manila_share_info.yml).
- name: Manage /exports/cluster and Manila share info
  hosts: control
  become: true
  tasks:
    # NOTE(review): indentation was lost in the reviewed copy of this file;
    # the condition is assumed to guard the whole block - confirm.
    - name: Export Manila share info
      when: os_manila_mount_share_info is defined
      block:
        - name: Ensure /exports/cluster directory exists
          file:
            path: /exports/cluster
            state: directory
            owner: root
            group: root
            # Quoted so the value is always read as an octal mode string.
            mode: "0755"

        - name: Copy manila share info to /exports/cluster
          copy:
            content: "{{ os_manila_mount_share_info | to_nice_yaml }}"
            dest: "/exports/cluster/manila_share_info.yml"
333 changes: 333 additions & 0 deletions ansible/roles/compute_init/files/compute-init.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
---

# Play header and variables for compute node initialisation. Runs against
# localhost with privilege escalation.
- name: Compute node initialisation
  hosts: localhost
  become: true
  # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA
  # (the literal values below are placeholders until that is wired up)
  vars:
    control_node_ip: "172.16.1.228"
    nfs_export_hosts: "/exports/hosts"
    nfs_export_cluster: "/exports/cluster"
    resolv_conf_nameservers:
      - "1.1.1.1"
      - "8.8.8.8"

    # NFS client settings for the shared home directory.
    nfs_disk_location: null
    nfs_export: "/exports/home"
    nfs_client_mnt_options: null
    nfs_client_mnt_point: "/home"
    nfs_client_mnt_state: mounted
    nfs_server: "{{ control_node_ip }}"

    # Manila share mount settings.
    os_manila_mount_shares: []
    os_manila_mount_state: mounted
    os_manila_mount_opts:
      - x-systemd.device-timeout=30
      - x-systemd.mount-timeout=30
      - noatime
      - _netdev  # prevents mount blocking early boot before networking available
      - rw
    os_manila_mount_share_info: []  # populated by lookup mode
    os_manila_mount_ceph_conf_path: /etc/ceph

    # Local user accounts.
    basic_users_manage_homedir: false
    basic_users_userdefaults:
      state: present
      create_home: "{{ basic_users_manage_homedir }}"
      generate_ssh_key: "{{ basic_users_manage_homedir }}"
      ssh_key_comment: "{{ item.name }}"
    # NOTE(review): credential is hard-coded in plain text in this file;
    # it should be supplied from metadata or a secret store instead.
    test_user_password: "zXpcWyGQL7jtZnqylQra4g=="
    basic_users_users:
      - name: testuser  # can't use rocky as $HOME isn't shared!
        # Salt is seeded from the hostname so the hash is stable between runs.
        password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}"  # idempotent
        uid: 1005
        state: present
    basic_users_groups: []

    # CVMFS client configuration; overrides are merged over the defaults.
    cvmfs_quota_limit_mb: 10000
    cvmfs_config_default:
      CVMFS_CLIENT_PROFILE: single
      CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}"
    cvmfs_config_overrides: {}
    cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}"

    # Must be a Jinja reference: a bare `control_node_ip` would be the literal
    # string, not the address. This value is passed to slurmd as --conf-server.
    openhpc_conf_server: "{{ control_node_ip }}"
    openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2
    openhpc_slurm_service_enabled: true
    openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}"
    openhpc_enable:
      control: false
      batch: true
      database: false
      runtime: true

tasks:
# Renders /etc/resolv.conf from a template, installs a NetworkManager drop-in
# (90-dns-none.conf) and reloads NetworkManager only if that drop-in changed.
- name: Configure resolv.conf
  block:
    - name: Set nameservers in /etc/resolv.conf
      ansible.builtin.template:
        src: /etc/ansible-init/templates/resolv.conf.j2
        dest: /etc/resolv.conf
        owner: root
        group: root
        mode: u=rw,og=r

    - name: Disable NetworkManager control of resolv.conf
      ansible.builtin.copy:
        src: /etc/ansible-init/files/NetworkManager-dns-none.conf
        dest: /etc/NetworkManager/conf.d/90-dns-none.conf
        owner: root
        group: root
        mode: u=rw,og=r
      register: _copy_nm_config

    # Reload is skipped when the drop-in was already in place.
    - name: Reload NetworkManager
      ansible.builtin.systemd:
        name: NetworkManager
        state: reloaded
      when: _copy_nm_config.changed | default(false)


# Mounts the hosts export from the control node at /mnt/hosts over NFS and
# copies its `hosts` file over the local /etc/hosts.
- name: Mount /etc/hosts on compute nodes
  block:
    - name: Ensure the mount directory exists
      ansible.builtin.file:
        path: /mnt/hosts
        state: directory
        mode: "0755"

    - name: Mount /mnt/hosts
      ansible.posix.mount:
        path: /mnt/hosts
        # Reference the variable directly; going through the internal `vars`
        # dict bypasses normal templating of the value.
        src: "{{ control_node_ip }}:{{ nfs_export_hosts }}"
        fstype: nfs
        opts: rw,sync
        state: mounted

    - name: Copy /mnt/hosts/hosts contents to /etc/hosts
      ansible.builtin.copy:
        src: /mnt/hosts/hosts
        dest: /etc/hosts
        owner: root
        group: root
        mode: "0644"


# Mounts two NFS exports: the home export at nfs_client_mnt_point, and the
# cluster export from the control node at /mnt/.
- name: NFS client mount
  block:
    - name: ensure mount directory exists
      ansible.builtin.file:
        path: "{{ nfs_client_mnt_point }}"
        state: directory

    - name: mount the filesystem
      ansible.posix.mount:
        path: "{{ nfs_client_mnt_point }}"
        src: "{{ nfs_server }}:{{ nfs_export }}"
        fstype: nfs
        # Honour nfs_client_mnt_options when set; when it is unset or empty
        # the argument is omitted, which is the previous behaviour.
        opts: "{{ nfs_client_mnt_options | default(omit, true) }}"
        state: "{{ nfs_client_mnt_state }}"

    - name: Ensure the mount directory exists
      ansible.builtin.file:
        path: /mnt/
        state: directory
        mode: "0755"

    # NOTE(review): /mnt is the parent of /mnt/hosts, so this mount will cover
    # anything previously mounted beneath it - confirm that is intended.
    - name: Mount /mnt/
      ansible.posix.mount:
        path: /mnt/
        # Reference the variable directly rather than via the internal `vars`
        # dict, which bypasses normal templating of the value.
        src: "{{ control_node_ip }}:{{ nfs_export_cluster }}"
        fstype: nfs
        opts: rw,sync
        state: mounted


# Reads share details from /mnt/manila_share_info.yml, writes the Ceph
# configuration and per-share client keyrings, then mounts each entry of
# os_manila_mount_shares (paired positionally with the share details) and
# applies ownership/mode to the mounted directory.
- name: Manila mount
  block:
    - name: Read manila share from nfs file
      ansible.builtin.slurp:
        src: "/mnt/manila_share_info.yml"
      register: manila_share_info_file

    # slurp returns base64-encoded content, hence the decode before parsing.
    - name: Parse and set fact for manila share info
      ansible.builtin.set_fact:
        os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}"

    - name: Ensure Ceph configuration directory exists
      ansible.builtin.file:
        path: "{{ os_manila_mount_ceph_conf_path }}"
        state: directory
        mode: "0755"
        owner: root
        group: root

    - name: Configure ceph.conf using os_manila_mount_host
      ansible.builtin.template:
        src: /etc/ansible-init/templates/ceph.conf.j2
        dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf"
        owner: root
        group: root
        mode: "0600"

    - name: Ensure mount directory exists
      ansible.builtin.file:
        path: "{{ item.mount_path }}"
        state: directory
        owner: "{{ item.mount_user | default(omit) }}"
        group: "{{ item.mount_group | default(omit) }}"
        mode: "{{ item.mount_mode | default(omit) }}"
      loop: "{{ os_manila_mount_shares }}"
      loop_control:
        label: "{{ item.share_name }}"

    - name: Write Ceph client keyring
      ansible.builtin.template:
        src: /etc/ansible-init/templates/ceph.keyring.j2
        dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring"
        mode: "0600"
        owner: root
        group: root
      loop: "{{ os_manila_mount_share_info }}"
      loop_control:
        label: "{{ item.share_name }}"

    # item[0] is the requested share, item[1] the matching share details.
    - name: Mount the Ceph share
      ansible.posix.mount:
        path: "{{ item[0].mount_path }}"
        src: "{{ item[1].host }}:{{ item[1].export }}"
        fstype: ceph
        opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}"
        # NB share_user is looked up here in case of autodetection
        state: "{{ item[0].mount_state | default(os_manila_mount_state) }}"
      loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}"
      loop_control:
        label: "{{ item[0].share_name }}"

    - name: Ensure mounted directory has correct permissions
      ansible.builtin.file:
        path: "{{ item.mount_path }}"
        state: directory
        owner: "{{ item.mount_user | default(omit) }}"
        group: "{{ item.mount_group | default(omit) }}"
        mode: "{{ item.mount_mode | default(omit) }}"
      loop: "{{ os_manila_mount_shares }}"
      loop_control:
        label: "{{ item.share_name }}"
      # Both states must be list members: `'mounted' or 'ephemeral'` evaluates
      # to just 'mounted', which silently excluded ephemeral mounts.
      when: item.mount_state | default(os_manila_mount_state) in ['mounted', 'ephemeral']


# Creates the groups in basic_users_groups and the accounts in
# basic_users_users, then writes a sudoers drop-in for every user entry that
# carries a `sudo` key.
- name: Basic users setup
  block:
    # Each list entry is passed whole as the module's arguments.
    - name: Create groups
      loop: "{{ basic_users_groups }}"
      ansible.builtin.group: "{{ item }}"

    # Per-user settings are layered over basic_users_userdefaults before
    # being filtered down to the module's arguments.
    - name: Create users
      register: basic_users_info
      loop: "{{ basic_users_users }}"
      loop_control:
        label: "{{ item.name }} [{{ item.state | default('present') }}]"
      ansible.builtin.user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}"

    # Entries without a `sudo` key are skipped.
    - name: Write sudo rules
      when: "'sudo' in item"
      loop: "{{ basic_users_users }}"
      loop_control:
        label: "{{ item.name }}"
      ansible.builtin.blockinfile:
        create: true
        path: /etc/sudoers.d/80-{{ item.name}}-user
        block: "{{ item.sudo }}"


# Installs the CVMFS client and the EESSI CVMFS configuration package, writes
# the cvmfs_config key/value pairs to /etc/cvmfs/default.local and runs
# `cvmfs_config setup`.
- name: Configure EESSI
  block:
    # rpm_key fetches and imports the key in one idempotent step, replacing a
    # download to a working-directory-relative path plus a raw `rpm --import`.
    # NOTE(review): the key is fetched over plain HTTP - consider an HTTPS
    # source if one is available.
    - name: Import Cern GPG key
      ansible.builtin.rpm_key:
        key: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM
        state: present

    - name: Add CVMFS repo
      ansible.builtin.dnf:
        name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm

    - name: Install CVMFS
      ansible.builtin.dnf:
        name: cvmfs

    - name: Install EESSI CVMFS config
      ansible.builtin.dnf:
        name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm
        # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok?
        disable_gpg_check: true

    # One option per cvmfs_config entry, written without a section header.
    - name: Add base CVMFS config
      community.general.ini_file:
        dest: /etc/cvmfs/default.local
        section: null
        option: "{{ item.key }}"
        value: "{{ item.value }}"
        no_extra_spaces: true
      loop: "{{ cvmfs_config | dict2items }}"

    # NOTE: Not clear how to make this idempotent
    - name: Ensure CVMFS config is setup
      ansible.builtin.command:
        cmd: "cvmfs_config setup"
      # Always reported as changed (the command module's default), stated
      # explicitly because no reliable change check is known.
      changed_when: true


# Installs the Munge key from /mnt, points slurmd at the configuration server
# via SLURMD_OPTIONS, and sets the enabled/running state of the munge and
# slurmd services.
- name: Configure openhpc
  block:
    - name: Fix permissions on /etc to pass Munge startup checks
      # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root)
      # which fails munged startup checks
      ansible.builtin.file:
        path: /etc
        state: directory
        mode: g-w

    - name: Copy Munge key from NFS-mounted directory to /etc/munge
      ansible.builtin.copy:
        src: "/mnt/openhpc_munge.key"
        dest: "/etc/munge/munge.key"
        owner: munge
        group: munge
        mode: "0400"

    # Only rendered when openhpc_enable.control is true.
    - name: Create gres.conf
      ansible.builtin.template:
        src: "{{ openhpc_gres_template }}"
        dest: /etc/slurm/gres.conf
        mode: "0600"
        owner: slurm
        group: slurm
      when: openhpc_enable.control | default(false) | bool
      register: ohpc_gres_conf

    # Replaces any existing SLURMD_OPTIONS= line, creating the file if needed.
    - name: Set slurmctld location for configless operation
      ansible.builtin.lineinfile:
        path: /etc/sysconfig/slurmd
        line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'"
        regexp: "^SLURMD_OPTIONS="
        create: true
        owner: root
        group: root
        mode: "0644"

    - name: Configure Munge service
      ansible.builtin.service:
        name: munge
        enabled: "{{ openhpc_slurm_service_enabled | bool }}"
        state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"

    - name: Ensure slurmd state
      ansible.builtin.service:
        name: slurmd
        enabled: "{{ openhpc_slurm_service_enabled | bool }}"
        state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
      when: openhpc_enable.batch | default(false) | bool
Loading