diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index b08854adb..848517bb8 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -24,6 +24,9 @@ on:
       - '!.gitignore'
       - '!.github/workflows/'
       - '.github/workflows/stackhpc'
+    # Exclusion-only filter: `branches` needs at least one positive pattern, so use branches-ignore.
+    branches-ignore:
+      - 'feat/compute-script'
 jobs:
   openstack:
     name: openstack-ci
diff --git a/ansible/.gitignore b/ansible/.gitignore
index 8edcc4360..02fb437f3 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -58,6 +58,8 @@ roles/*
 !roles/squid/**
 !roles/tuned/
 !roles/tuned/**
+!roles/compute_init/
+!roles/compute_init/**
 !roles/k3s/
 !roles/k3s/**
 !roles/k9s/
diff --git a/ansible/extras.yml b/ansible/extras.yml
index 107f85252..4cbe931b1 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -37,6 +37,14 @@
     - import_role:
         name: persist_hostkeys
 
+- name: Inject ansible-init compute script
+  hosts: compute_init
+  tags: compute_init
+  become: yes
+  tasks:
+    - import_role:
+        name: compute_init
+
 - name: Install k9s
   become: yes
   hosts: k9s
diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
new file mode 100644
index 000000000..165700668
--- /dev/null
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -0,0 +1,333 @@
+---
+# Self-configuration playbook, run against localhost. It reads the OpenStack
+# metadata service and, only when meta.slurm_compute is truthy, mounts
+# <meta.k3s_server>:/exports/cluster and then configures DNS, /etc/hosts, NFS
+# and Manila mounts, users, EESSI and Slurm, largely from files on that share.
+
+- name: Compute node initialisation
+  hosts: localhost
+  become: true
+  vars:
+    os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
+    iam_slurm_compute: "{{ os_metadata.meta.slurm_compute | default(false) }}"
+    server_node_ip: "{{ os_metadata.meta.k3s_server }}"
+
+    resolv_conf_nameservers: [1.1.1.1, 8.8.8.8]
+
+    nfs_configurations:
+      - nfs_export: "/exports/home"
+        nfs_client_mnt_options: null
+        nfs_client_mnt_point: "/home"
+        nfs_client_mnt_state: mounted
+        nfs_server: "{{ server_node_ip }}"
+
+    os_manila_mount_state: mounted
+    os_manila_mount_opts:
+      - x-systemd.device-timeout=30
+      - x-systemd.mount-timeout=30
+      - noatime
+      - _netdev  # prevents mount blocking early boot before networking available
+      - rw
+    os_manila_mount_ceph_conf_path: /etc/ceph
+
+    basic_users_manage_homedir: false
+    basic_users_userdefaults:
+      state: present
+      create_home: "{{ basic_users_manage_homedir }}"
+      generate_ssh_key: "{{ basic_users_manage_homedir }}"
+      ssh_key_comment: "{{ item.name }}"
+    # NOTE(review): credential committed in plain text - confirm it is test-only before merging.
+    test_user_password: "zXpcWyGQL7jtZnqylQra4g=="
+    basic_users_users:
+      - name: testuser  # can't use rocky as $HOME isn't shared!
+        password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}"  # idempotent
+        uid: 1005
+    basic_users_groups: []
+
+    openhpc_conf_server: "{{ server_node_ip }}"
+
+  tasks:
+    - name: Skip initialization if slurm_compute metadata set to false
+      debug:
+        msg: "Skipping compute initialization"
+      when: not iam_slurm_compute | bool
+
+    - name: Configure resolv.conf
+      block:
+        - name: Set nameservers in /etc/resolv.conf
+          ansible.builtin.template:
+            src: /etc/ansible-init/templates/resolv.conf.j2
+            dest: /etc/resolv.conf
+            owner: root
+            group: root
+            mode: u=rw,og=r
+
+        - name: Disable NetworkManager control of resolv.conf
+          ansible.builtin.copy:
+            src: /etc/ansible-init/files/NetworkManager-dns-none.conf
+            dest: /etc/NetworkManager/conf.d/90-dns-none.conf
+            owner: root
+            group: root
+            mode: u=rw,og=r
+          register: _copy_nm_config
+
+        - name: Reload NetworkManager
+          ansible.builtin.systemd:
+            name: NetworkManager
+            state: reloaded
+          when: _copy_nm_config.changed | default(false)
+      when:
+        - resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0
+        - iam_slurm_compute | bool
+
+
+    - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts
+      block:
+        - name: Ensure the mount directory exists
+          file:
+            path: /mnt/cluster
+            state: directory
+            mode: "0755"
+
+        - name: Mount /mnt/cluster
+          mount:
+            path: /mnt/cluster
+            src: "{{ server_node_ip }}:/exports/cluster"
+            fstype: nfs
+            opts: rw,sync
+            state: mounted
+          register: nfs_mount_result
+          ignore_errors: true
+
+        - name: Fail gracefully if NFS mount is not available
+          debug:
+            msg: "NFS mount failed. Skipping compute initialization. Re-image if this persists."
+          when: nfs_mount_result.failed
+
+        - name: Copy /mnt/cluster/hosts contents to /etc/hosts
+          copy:
+            src: /mnt/cluster/hosts
+            dest: /etc/hosts
+            owner: root
+            group: root
+            mode: "0644"
+          when: not nfs_mount_result.failed
+      when: iam_slurm_compute | bool
+
+
+    - name: Include hostvars from NFS share
+      block:
+        - name: Extract short hostname using a shell block
+          shell: |
+            HOSTNAME=$(hostname)
+            echo "${HOSTNAME%.test.invalid}"
+          register: short_hostname
+
+        # - name: Include vars from NFS mount
+        #   include_vars:
+        #     file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml"
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed
+
+
+    - name: NFS client mount
+      block:
+        - name: ensure mount directory exists
+          file:
+            path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}"
+            state: directory
+          loop: "{{ nfs_configurations }}"
+
+        - name: mount the filesystem
+          mount:
+            path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}"
+            src: "{{ item.get('nfs_server', nfs_server) }}:{{ item.get('nfs_export', nfs_export) }}"
+            opts: "{{ item['nfs_client_mnt_options'] | default(nfs_client_mnt_options, true) | default(omit, true) }}"  # for some reason items.get() here fails with "an incorrect mount option was specified"
+            fstype: nfs
+            state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}"
+          loop: "{{ nfs_configurations }}"
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed
+
+
+    - name: Manila mount
+      block:
+        - name: Read manila share info from nfs file
+          slurp:
+            src: "/mnt/cluster/manila_share_info.yml"
+          register: manila_share_info_file
+          no_log: true
+
+        - name: Parse and set fact for manila share info
+          set_fact:
+            os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}"
+
+        - name: Read manila shares from nfs file
+          slurp:
+            src: "/mnt/cluster/manila_shares.yml"
+          register: manila_shares_file
+
+        - name: Parse and set fact for manila shares
+          set_fact:
+            os_manila_mount_shares: "{{ manila_shares_file.content | b64decode | from_yaml }}"
+
+        - name: Ensure Ceph configuration directory exists
+          ansible.builtin.file:
+            path: "{{ os_manila_mount_ceph_conf_path }}"
+            state: directory
+            mode: "0755"
+            owner: root
+            group: root
+
+        - name: Configure ceph.conf using os_manila_mount_host
+          ansible.builtin.template:
+            src: /etc/ansible-init/templates/ceph.conf.j2
+            dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf"
+            owner: root
+            group: root
+            mode: "0600"
+
+        - name: Ensure mount directory exists
+          ansible.builtin.file:
+            path: "{{ item.mount_path }}"
+            state: directory
+            owner: "{{ item.mount_user | default(omit) }}"
+            group: "{{ item.mount_group | default(omit) }}"
+            mode: "{{ item.mount_mode | default(omit) }}"
+          loop: "{{ os_manila_mount_shares }}"
+          loop_control:
+            label: "{{ item.share_name }}"
+
+        - name: Write Ceph client keyring
+          ansible.builtin.template:
+            src: /etc/ansible-init/templates/ceph.keyring.j2
+            dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring"
+            mode: "0600"
+            owner: root
+            group: root
+          loop: "{{ os_manila_mount_share_info }}"
+          loop_control:
+            label: "{{ item.share_name }}"
+
+        - name: Mount the Ceph share
+          ansible.posix.mount:
+            path: "{{ item[0].mount_path }}"
+            src: "{{ item[1].host }}:{{ item[1].export }}"
+            fstype: ceph
+            opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}"
+            # NB share_user is looked up here in case of autodetection
+            state: "{{ item[0].mount_state | default(os_manila_mount_state) }}"
+          loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}"
+          loop_control:
+            label: "{{ item[0].share_name }}"
+
+        - name: Ensure mounted directory has correct permissions
+          ansible.builtin.file:
+            path: "{{ item.mount_path }}"
+            state: directory
+            owner: "{{ item.mount_user | default(omit) }}"
+            group: "{{ item.mount_group | default(omit) }}"
+            mode: "{{ item.mount_mode | default(omit) }}"
+          loop: "{{ os_manila_mount_shares }}"
+          loop_control:
+            label: "{{ item.share_name }}"
+          # List both states explicitly: `'mounted' or 'ephemeral'` evaluates to just 'mounted'.
+          when: item.mount_state | default(os_manila_mount_state) in ['mounted', 'ephemeral']
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed
+
+
+    - name: Basic users setup
+      block:
+        - name: Create groups
+          ansible.builtin.group: "{{ item }}"
+          loop: "{{ basic_users_groups }}"
+
+        - name: Create users
+          user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}"
+          loop: "{{ basic_users_users }}"
+          loop_control:
+            label: "{{ item.name }} [{{ item.state | default('present') }}]"
+          register: basic_users_info
+
+        - name: Write sudo rules
+          blockinfile:
+            path: /etc/sudoers.d/80-{{ item.name }}-user
+            block: "{{ item.sudo }}"
+            create: true
+          loop: "{{ basic_users_users }}"
+          loop_control:
+            label: "{{ item.name }}"
+          when: "'sudo' in item"
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed
+
+
+    - name: Configure EESSI
+      block:
+        - name: Copy /mnt/cluster/cvmfs/default.local contents to /etc/cvmfs/default.local
+          copy:
+            src: /mnt/cluster/cvmfs/default.local
+            dest: /etc/cvmfs/default.local
+            owner: root
+            group: root
+            mode: "0644"
+
+        # NOTE: Not clear how to make this idempotent
+        - name: Ensure CVMFS config is setup
+          command:
+            cmd: "cvmfs_config setup"
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed
+
+
+    - name: Configure openhpc
+      block:
+        - name: Fix permissions on /etc to pass Munge startup checks
+          # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root)
+          # which fails munged startup checks
+          file:
+            path: /etc
+            state: directory
+            mode: g-w
+
+        - name: Copy Munge key from NFS-mounted directory to /etc/munge
+          copy:
+            src: "/mnt/cluster/openhpc_munge.key"
+            dest: "/etc/munge/munge.key"
+            owner: munge
+            group: munge
+            mode: "0400"
+
+        - name: Set slurmctld location for configless operation
+          lineinfile:
+            path: /etc/sysconfig/slurmd
+            line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'"
+            regexp: "^SLURMD_OPTIONS="
+            create: true
+            owner: root
+            group: root
+            mode: "0644"
+
+        - name: Configure Munge service
+          service:
+            name: munge
+            enabled: true
+            state: started
+
+        - name: Ensure slurmd state
+          service:
+            name: slurmd
+            enabled: true
+            state: started
+
+        - name: Ensure node is in cluster
+          command: scontrol update state=resume nodename={{ short_hostname.stdout }}
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed
diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml
new file mode 100644
index 000000000..f5513a80a
--- /dev/null
+++ b/ansible/roles/compute_init/tasks/main.yml
@@ -0,0 +1,155 @@
+---
+# Stages the files compute-init.yml needs under /etc/ansible-init on each
+# target host, and publishes cluster-wide data (hosts file, Manila share
+# details, CVMFS config, munge key) to /exports/cluster on the control node.
+
+- name: Ensure directories exist
+  file:
+    path: "/etc/ansible-init/{{ item }}"
+    state: directory
+    owner: root
+    group: root
+    mode: "0755"
+  loop:
+    - templates
+    - files
+    - library
+    - filter_plugins
+    - playbooks  # destination of compute-init.yml, copied by the last task in this file
+
+- name: Inject templates
+  copy:
+    src: '{{ item }}'
+    dest: '/etc/ansible-init/templates/{{ item | basename }}'
+    owner: root
+    group: root
+    mode: "0644"
+  loop:
+    - ../../resolv_conf/templates/resolv.conf.j2
+    - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2
+    - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2
+
+- name: Inject files
+  copy:
+    src: '{{ item }}'
+    dest: '/etc/ansible-init/files/{{ item | basename }}'
+    owner: root
+    group: root
+    mode: "0644"
+  loop:
+    - ../../resolv_conf/files/NetworkManager-dns-none.conf
+
+- name: Inject libraries
+  copy:
+    src: '{{ item }}'
+    dest: '/etc/ansible-init/library/{{ item | basename }}'
+    owner: root
+    group: root
+    mode: "0644"
+  loop:
+    - ../../basic_users/library/terminate_user_sessions.py
+    - ../../stackhpc.os-manila-mount/library/os_manila_share.py
+    - ../../stackhpc.openhpc/library/sacct_cluster.py
+
+- name: Inject filter_plugins
+  copy:
+    src: '{{ item }}'
+    dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}'
+    owner: root
+    group: root
+    mode: "0644"
+  loop:
+    - ../../basic_users/filter_plugins/filter_keys.py
+    - ../../stackhpc.openhpc/filter_plugins/slurm_conf.py
+
+- name: Add filter_plugins ansible.cfg
+  lineinfile:
+    path: /etc/ansible-init/ansible.cfg
+    line: "filter_plugins = /etc/ansible-init/filter_plugins"
+    state: present
+    owner: root
+    group: root
+    mode: "0644"
+
+- name: Ensure nfs /exports/cluster configured
+  block:
+    - name: Ensure the /exports/cluster directory exists
+      file:
+        path: /exports/cluster
+        state: directory
+        owner: root
+        group: root
+        mode: "0755"
+
+    - name: Copy /etc/hosts to /exports/cluster
+      copy:
+        src: /etc/hosts
+        dest: /exports/cluster/hosts
+        owner: root
+        group: root
+        mode: "0644"
+        remote_src: true
+
+    # NOTE(review): no owner/mode set; presumably contains Ceph access keys - confirm permissions.
+    - name: Copy manila share info to /exports/cluster
+      copy:
+        content: "{{ os_manila_mount_share_info | to_nice_yaml }}"
+        dest: "/exports/cluster/manila_share_info.yml"
+      when: os_manila_mount_share_info is defined
+
+    - name: Copy manila mount shares to /exports/cluster
+      copy:
+        content: "{{ os_manila_mount_shares | to_nice_yaml }}"
+        dest: "/exports/cluster/manila_shares.yml"
+      when: os_manila_mount_shares is defined
+
+    - name: Ensure /exports/cluster/cvmfs directory exists
+      file:
+        path: /exports/cluster/cvmfs
+        state: directory
+        owner: root
+        group: root
+        mode: "0755"
+
+    - name: Copy EESSI CVMFS config to /exports/cluster
+      copy:
+        src: /etc/cvmfs/default.local
+        dest: /exports/cluster/cvmfs/default.local
+        owner: root
+        group: root
+        mode: "0644"
+        remote_src: true
+
+    - name: Write openhpc munge key
+      copy:
+        content: "{{ vault_openhpc_mungekey | b64decode }}"
+        dest: "/exports/cluster/openhpc_munge.key"
+        owner: munge
+        group: munge
+        mode: "0400"
+
+    # - name: Ensure /exports/cluster/inventory_hostname directory exists
+    #   file:
+    #     path: /exports/cluster/{{ inventory_hostname }}
+    #     state: directory
+    #     owner: root
+    #     group: root
+    #     mode: 0755
+
+    # - name: Template hostvars
+    #   template:
+    #     src: ../templates/hostvars.j2
+    #     dest: "/exports/cluster/{{ inventory_hostname }}/hostvars.yml"
+    #     owner: root
+    #     group: root
+    #     mode: 0644
+
+  delegate_to: "{{ groups['control'] | first }}"
+
+- name: Inject compute initialisation playbook
+  copy:
+    src: compute-init.yml
+    dest: /etc/ansible-init/playbooks/compute-init.yml
+    owner: root
+    group: root
+    mode: "0644"
diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml
index bd340b190..84371c99a 100644
--- a/environments/common/inventory/group_vars/all/nfs.yml
+++ b/environments/common/inventory/group_vars/all/nfs.yml
@@ -15,3 +15,9 @@ nfs_configurations:
     nfs_server: "{{ nfs_server_default }}"
     nfs_export: "/exports/home" # assumes skeleton TF is being used
     nfs_client_mnt_point: "/home"
+
+  - comment: Export /exports/cluster from Slurm control node
+    nfs_enable:
+      server: "{{ inventory_hostname in groups['control'] }}"
+      clients: false
+    nfs_export: "/exports/cluster"  # control node has to copy in /etc/hosts to here
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 9b9aa5bf0..ba846777c 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -136,6 +136,9 @@ freeipa_client
 [ansible_init]
 # Hosts to run linux-anisble-init
 
+[compute_init]
+# Hosts to deploy compute initialisation ansible-init script to.
+
 [k3s]
 # Hosts to run k3s server/agent
 
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index ba5cbc08d..5ada017e1 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -82,6 +82,10 @@ openhpc
 # Hosts to run ansible-init
 cluster
 
+[compute_init:children]
+# Hosts to deploy compute initialisation ansible-init script to.
+cluster
+
 [k3s:children]
 # Hosts to run k3s server/agent
 openhpc