Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ansible/roles/cacerts/tasks/export.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
copy:
src: "{{ item }}"
dest: /exports/cluster/cacerts/
owner: root
owner: slurm
group: root
mode: 0644
with_fileglob:
Expand Down
64 changes: 43 additions & 21 deletions ansible/roles/compute_init/files/compute-init.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
tuned_started: true

nfs_client_mnt_point: "/mnt"
nfs_client_mnt_options:
nfs_client_mnt_options: "defaults,nosuid,nodev"
nfs_client_mnt_state: mounted
nfs_configurations:
nfs_enable:
Expand All @@ -47,15 +47,15 @@
- noatime
- _netdev # prevents mount blocking early boot before networking available
- rw
- nodev
- nosuid

basic_users_groups: []
basic_users_manage_homedir: false # homedir must already exist on shared filesystem
basic_users_userdefaults:
state: present
create_home: "{{ basic_users_manage_homedir }}"
generate_ssh_key: "{{ basic_users_manage_homedir }}"
generate_ssh_key: true
ssh_key_comment: "{{ item.name }}"
basic_users_users: []
basic_users_groups: []

tasks:
- block:
Expand Down Expand Up @@ -96,6 +96,7 @@
when: _mount_mnt_cluster.failed

- name: Check if hostvars exist
become_user: slurm
stat:
path: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml"
register: hostvars_stat
Expand All @@ -109,17 +110,33 @@
- meta: end_play
when: not hostvars_stat.stat.exists

- name: Load hostvars from NFS
- name: Sync /mnt/cluster to /tmp
become_user: slurm
synchronize:
src: "/mnt/cluster/"
dest: "/tmp/cluster/"
archive: yes
recursive: yes

- name: Unmount /mnt/cluster after sync
mount:
path: /mnt/cluster
state: unmounted

- name: Load hostvars
# this is higher priority than vars block = normal ansible's hostvars
include_vars:
file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname

# TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups?
file: "/tmp/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml"

- name: Run chrony role
ansible.builtin.include_role:
name: mrlesmithjr.chrony
when: enable_chrony | bool
tasks_from: config_chrony.yml
vars:
# workaround for set_facts.yml:
chrony_config: /etc/chrony.conf
chrony_service: chronyd
when: enable_chrony

- name: Configure resolve.conf
block:
Expand Down Expand Up @@ -149,7 +166,7 @@

- name: Copy cluster /etc/hosts
copy:
src: /mnt/cluster/hosts
src: /tmp/cluster/hosts
dest: /etc/hosts
owner: root
group: root
Expand All @@ -160,14 +177,14 @@
ansible.builtin.include_role:
name: cacerts
vars:
cacerts_cert_dir: "/mnt/cluster/cacerts"
cacerts_cert_dir: "/tmp/cluster/cacerts"
when: enable_cacerts

- name: Configure sshd
ansible.builtin.include_role:
name: sshd
vars:
sshd_conf_src: "/mnt/cluster/hostconfig/{{ ansible_hostname }}/sshd.conf"
sshd_conf_src: "/tmp/cluster/hostconfig/{{ ansible_hostname }}/sshd.conf"
when: enable_sshd

- name: Configure tuned
Expand All @@ -179,7 +196,7 @@
name: sssd
tasks_from: configure.yml
vars:
sssd_conf_src: "/mnt/cluster/hostconfig/{{ ansible_hostname }}/sssd.conf"
sssd_conf_src: "/tmp/cluster/hostconfig/{{ ansible_hostname }}/sssd.conf"
when: enable_sssd

# NFS client mount
Expand All @@ -194,7 +211,7 @@
block:
- name: Read manila share info from nfs file
include_vars:
file: /mnt/cluster/manila_share_info.yml
file: /tmp/cluster/manila_share_info.yml
no_log: true # contains secrets

- name: Ensure Ceph configuration directory exists
Expand Down Expand Up @@ -275,28 +292,33 @@
loop: "{{ basic_users_groups }}"

- name: Create users
user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}"
user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}"
loop: "{{ basic_users_users }}"
loop_control:
label: "{{ item.name }} [{{ item.state | default('present') }}]"
register: basic_users_info
label: "{{ item.name }}"
vars:
_disable_homedir: # ensure this task doesn't touch $HOME
create_home: false
generate_ssh_key: false

- name: Write sudo rules
blockinfile:
path: /etc/sudoers.d/80-{{ item.name}}-user
path: /etc/sudoers.d/80-{{ item.name }}-user
block: "{{ item.sudo }}"
create: true
loop: "{{ basic_users_users }}"
loop_control:
label: "{{ item.name }}"
when: "'sudo' in item"
when:
- item.state | default('present') == 'present'
- "'sudo' in item"
when: enable_basic_users

- name: EESSI
block:
- name: Copy cvmfs config
copy:
src: /mnt/cluster/cvmfs/default.local
src: /tmp/cluster/cvmfs/default.local
dest: /etc/cvmfs/default.local
owner: root
group: root
Expand Down
12 changes: 6 additions & 6 deletions ansible/roles/compute_init/tasks/export.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
copy:
src: /etc/hosts
dest: /exports/cluster/hosts
owner: root
owner: slurm
group: root
mode: u=rw,go=
remote_src: true
Expand Down Expand Up @@ -41,7 +41,7 @@
copy:
content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}"
dest: /exports/cluster/manila_share_info.yml
owner: root
owner: slurm
group: root
mode: u=rw,g=r
run_once: true
Expand All @@ -55,7 +55,7 @@
file:
path: /exports/cluster/cvmfs
state: directory
owner: root
owner: slurm
group: root
mode: 0755
run_once: true
Expand All @@ -65,7 +65,7 @@
copy:
src: /etc/cvmfs/default.local
dest: /exports/cluster/cvmfs/default.local
owner: root
owner: slurm
group: root
mode: 0644
remote_src: true
Expand All @@ -82,9 +82,9 @@
file:
path: "/exports/cluster/hostconfig/{{ inventory_hostname }}/"
state: directory
owner: root
owner: slurm
group: root
mode: u=rw,go=
mode: u=rX,g=rwX,o=
delegate_to: "{{ groups['control'] | first }}"

- name: Template sssd config
Expand Down
17 changes: 0 additions & 17 deletions environments/.stackhpc/inventory/group_vars/all/nfs.yml

This file was deleted.

4 changes: 2 additions & 2 deletions environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"cluster_image": {
"RL8": "openhpc-RL8-250312-1522-7e5c051d",
"RL9": "openhpc-RL9-250312-1435-7e5c051d"
"RL8": "openhpc-RL8-250317-1544-4a641ff2",
"RL9": "openhpc-RL9-250317-1545-4a641ff2"
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ansible_init_wait: 1200 # seconds
ansible_init_wait: 300 # seconds

ansible_init_pip_packages:
# role defaults:
Expand Down
10 changes: 6 additions & 4 deletions environments/common/inventory/group_vars/all/nfs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ nfs_configurations:
# NB: these are the stackhpc.nfs role defaults but are set here to prevent them
# being accidentally overridden via default options
nfs_export_options: 'rw,secure,root_squash'
# prevent non-cluster IPs mounting the share:
# NB: this is set as default for all shares above but is repeated here
# in case nfs_export_clients is overridden
nfs_export_clients: "{{ _nfs_node_ips }}"

- comment: Export /exports/cluster from Slurm control node
nfs_enable:
server: "{{ inventory_hostname in groups['control'] }}"
clients: false
nfs_export: "/exports/cluster"