diff --git a/ansible/.gitignore b/ansible/.gitignore index c1ec5de80..6ae64c72e 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -94,3 +94,5 @@ roles/* !roles/slurm_recompile/** !roles/nhc/ !roles/nhc/** +!roles/eessi/ +!roles/eessi/** diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 30a8abafa..50d024676 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -186,8 +186,9 @@ become: yes tasks: - name: Install and configure tuneD - import_role: + include_role: name: tuned + tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - hosts: freeipa_server # Done here as it might be providing DNS @@ -217,31 +218,27 @@ become: yes tags: firewalld tasks: - - import_role: + - include_role: name: firewalld + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - hosts: fail2ban gather_facts: false become: yes tags: fail2ban tasks: - - import_role: + - include_role: name: fail2ban + tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Setup podman gather_facts: false hosts: podman tags: podman tasks: - - import_role: - name: podman - tasks_from: prereqs.yml - tags: prereqs - - - import_role: + - include_role: name: podman - tasks_from: config.yml - tags: config + tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - hosts: update gather_facts: false @@ -317,8 +314,10 @@ become: yes tags: linux_ansible_init tasks: - - include_role: + - name: Install ansible-init + include_role: name: azimuth_cloud.image_utils.linux_ansible_init + when: "appliances_mode == 'build'" - hosts: k3s:&builder become: yes diff --git a/ansible/extras.yml b/ansible/extras.yml index 54168e97d..08892e4ec 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -34,9 +34,10 @@ become: true gather_facts: false tasks: - - name: Install and configure EESSI - import_role: + - name: Install / configure EESSI + include_role: name: eessi + tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Setup CUDA hosts: cuda diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 21a4d4126..ded3de31f 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -108,7 +108,12 @@ tasks_from: install.yml when: "'mysql' in group_names" - - name: OpenHPC + - name: Install rebuild + include_role: + name: rebuild + tasks_from: install.yml + + - name: Install OpenHPC import_role: name: stackhpc.openhpc tasks_from: install.yml @@ -134,7 +139,6 @@ import_role: name: openondemand tasks_from: vnc_compute.yml - when: "'openondemand_desktop' in group_names" - name: Open Ondemand jupyter node @@ -153,7 +157,11 @@ tasks_from: install.yml when: "'opensearch' in group_names" - # slurm_stats - nothing to do + - import_role: + name: slurm_stats + tasks_from: install.yml + when: "'slurm_stats' in group_names" + - import_role: name: filebeat tasks_from: install.yml @@ -171,11 +179,9 @@ when: "'openondemand' in group_names" - name: slurm exporter - import_role: + include_role: name: slurm_exporter - tasks_from: install - vars: - slurm_exporter_state: stopped + tasks_from: install.yml when: "'slurm_exporter' in group_names" - name: Install alertmanager @@ -249,6 +255,11 @@ - import_role: name: cloudalchemy.grafana tasks_from: install.yml + - import_role: + name: cloudalchemy.grafana + tasks_from: plugins.yml + - include_role: # done in same play so it can use handlers from cloudalchemy.grafana + name: grafana-dashboards - name: Add support for 
NVIDIA GPU auto detection to Slurm hosts: cuda diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index 4665c0f8f..41a685d20 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -24,6 +24,8 @@ tasks: - include_role: name: stackhpc.os-manila-mount + tasks_from: "{{ item }}" + loop: "{{ ['lookup.yml', 'mount.yml'] if appliances_mode == 'configure' else ['main.yml'] }}" - name: Setup Lustre clients hosts: lustre diff --git a/ansible/final.yml b/ansible/final.yml index cd9b211e2..3e715dfa0 100644 --- a/ansible/final.yml +++ b/ansible/final.yml @@ -17,3 +17,14 @@ - include_role: name: compute_init tasks_from: export.yml + +- hosts: proxy + gather_facts: false + tags: proxy + become: yes + tasks: + - include_role: + name: proxy + vars: + proxy_state: absent + when: proxy_remove | default(false) | bool == true diff --git a/ansible/iam.yml b/ansible/iam.yml index 857b8f840..8b3bf6bff 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -20,9 +20,10 @@ become: yes tasks: - name: Install FreeIPA client - import_role: + include_role: name: freeipa tasks_from: client-install.yml + when: "appliances_mode != 'configure'" - name: Enrol FreeIPA client import_role: name: freeipa diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index e97946212..d34a65f9d 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -20,19 +20,22 @@ tasks: - include_role: name: slurm_stats + tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Deploy filebeat hosts: filebeat tags: filebeat tasks: - - import_role: + - include_role: name: filebeat + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Deploy node_exporter hosts: node_exporter tags: node_exporter tasks: - - import_role: name=cloudalchemy.node_exporter + - import_role: + name: cloudalchemy.node_exporter - name: Deploy OpenOndemand exporter hosts: openondemand @@ -46,12 +49,13 @@ tasks_from: exporter.yml - name: Deploy Slurm exporter - hosts: control + hosts: slurm_exporter become: true tags: slurm_exporter tasks: - - import_role: + - include_role: name: slurm_exporter + tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Setup core monitoring software hosts: prometheus @@ -68,7 +72,7 @@ # i.e. 
if prometheus_version isn't defined we don't care, so use what's already there set_fact: prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" - when: "{{ (prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined] }}" + when: "(prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined]" - import_role: name: cloudalchemy.prometheus @@ -76,16 +80,28 @@ hosts: grafana tags: grafana tasks: - - assert: - that: vault_grafana_admin_password is defined - fail_msg: "Must define vault_grafana_admin_password - use `ansible-playbook generate-passwords.yml` to generate a set of passwords" + - name: Skip plugin installation in configure mode + # done during fatimage - can't do this in vars block as that is recursive + ansible.builtin.set_fact: + grafana_plugins: "{{ [] if appliances_mode == 'configure' else grafana_plugins }}" + - name: Copy Grafana plugins installed in image into persistent grafana state + ansible.builtin.copy: + remote_src: true + src: /var/lib/grafana/plugins/ # trailing / means copy contents + dest: "{{ grafana_data_dir }}/plugins/" + # below matches what already exists: + owner: root + group: root + mode: '0755' + become: true - include_role: name: cloudalchemy.grafana vars: - # We use internal roles to register the dashboards as the role does not support all options that we require. + # Internal role used to install dashboards as cloudalchemy role does not support all required options: grafana_dashboards: [] - - import_role: # done in same play so it can use handlers from cloudalchemy.grafana + - include_role: # done in same play so it can use handlers from cloudalchemy.grafana name: grafana-dashboards + when: "appliances_mode != 'configure'" - name: Deploy alertmanager hosts: alertmanager diff --git a/ansible/portal.yml b/ansible/portal.yml index 2aa646ae9..d34beca57 100644 --- a/ansible/portal.yml +++ b/ansible/portal.yml @@ -5,6 +5,10 @@ become: yes gather_facts: yes # TODO tasks: + - name: Skip openondemand apps installation in configure mode + set_fact: + ood_install_apps: {} + when: appliances_mode == 'configure' - import_role: name: openondemand tasks_from: main.yml @@ -19,6 +23,7 @@ - import_role: name: openondemand tasks_from: vnc_compute.yml + when: appliances_mode != 'configure' # is run during build - hosts: openondemand_jupyter tags: @@ -30,3 +35,4 @@ - import_role: name: openondemand tasks_from: jupyter_compute.yml + when: appliances_mode != 'configure' # is run during build diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 2dbacc262..9f8abe6d9 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -1,5 +1,5 @@ --- -- name: Disable Pulp repos +- name: Remove password and disable Pulp repos ansible.builtin.yum_repository: file: "{{ item.file }}" name: "{{ item.name }}" @@ -8,7 +8,7 @@ enabled: false loop: "{{ dnf_repos_repolist }}" -- name: Disable EPEL repo +- name: Remove password and disable EPEL repo ansible.builtin.yum_repository: name: epel file: epel @@ -16,3 +16,17 @@ baseurl: "{{ dnf_repos_epel_baseurl }}" gpgcheck: false enabled: false + +- name: Get all repo files + ansible.builtin.find: + paths: /etc/yum.repos.d + patterns: '*.repo' + register: _dnf_repo_files + +- name: Disable every repo + ansible.builtin.replace: + path: "{{ item.path }}" + regexp: '^enabled\ ?=\ ?1' + 
replace: 'enabled=0' + backup: yes + loop: "{{ _dnf_repo_files.files }}" diff --git a/ansible/roles/eessi/tasks/configure.yml b/ansible/roles/eessi/tasks/configure.yml new file mode 100644 index 000000000..b3083761c --- /dev/null +++ b/ansible/roles/eessi/tasks/configure.yml @@ -0,0 +1,16 @@ +--- + +- name: Add base CVMFS config + community.general.ini_file: + dest: /etc/cvmfs/default.local + section: null + option: "{{ item.key }}" + value: "{{ item.value }}" + no_extra_spaces: true + loop: "{{ cvmfs_config | dict2items }}" + + +# NOTE: Not clear how to make this idempotent +- name: Ensure CVMFS config is setup + command: + cmd: "cvmfs_config setup" diff --git a/ansible/roles/eessi/tasks/main.yaml b/ansible/roles/eessi/tasks/install.yml similarity index 73% rename from ansible/roles/eessi/tasks/main.yaml rename to ansible/roles/eessi/tasks/install.yml index 91dd54887..a4adb0b47 100644 --- a/ansible/roles/eessi/tasks/main.yaml +++ b/ansible/roles/eessi/tasks/install.yml @@ -1,4 +1,5 @@ --- + - name: Download Cern GPG key ansible.builtin.get_url: url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM @@ -31,18 +32,3 @@ # - name: Install EESSI CVMFS config # dnf: # name: cvmfs-config-eessi - -- name: Add base CVMFS config - community.general.ini_file: - dest: /etc/cvmfs/default.local - section: null - option: "{{ item.key }}" - value: "{{ item.value }}" - no_extra_spaces: true - loop: "{{ cvmfs_config | dict2items }}" - - -# NOTE: Not clear how to make this idempotent -- name: Ensure CVMFS config is setup - command: - cmd: "cvmfs_config setup" diff --git a/ansible/roles/eessi/tasks/main.yml b/ansible/roles/eessi/tasks/main.yml new file mode 100644 index 000000000..79d326ceb --- /dev/null +++ b/ansible/roles/eessi/tasks/main.yml @@ -0,0 +1,4 @@ +--- + +- include_tasks: install.yml +- include_tasks: configure.yml diff --git a/ansible/roles/fail2ban/tasks/configure.yml b/ansible/roles/fail2ban/tasks/configure.yml new file mode 100644 index 000000000..e4951f726 --- /dev/null +++ b/ansible/roles/fail2ban/tasks/configure.yml @@ -0,0 +1,15 @@ +--- +- name: Create config + template: + dest: /etc/fail2ban/jail.local + src: jail.local.j2 + notify: Restart fail2ban + +- name: flush handlers + meta: flush_handlers + +- name: Ensure fail2ban running even if no config change + service: + name: fail2ban + state: started + enabled: true diff --git a/ansible/roles/fail2ban/tasks/install.yml b/ansible/roles/fail2ban/tasks/install.yml new file mode 100644 index 000000000..65f3bfef2 --- /dev/null +++ b/ansible/roles/fail2ban/tasks/install.yml @@ -0,0 +1,11 @@ +--- +- name: Install EPEL repo + package: + name: epel-release + +- name: Install fail2ban packages + package: + name: + - fail2ban-server + - fail2ban-firewalld + state: present diff --git a/ansible/roles/fail2ban/tasks/main.yml b/ansible/roles/fail2ban/tasks/main.yml index 244a2edf9..410e9436d 100644 --- a/ansible/roles/fail2ban/tasks/main.yml +++ b/ansible/roles/fail2ban/tasks/main.yml @@ -1,26 +1,4 @@ --- -- name: Install EPEL repo - package: - name: epel-release -- name: Install fail2ban packages - package: - name: - - fail2ban-server - - fail2ban-firewalld - state: present - -- name: Create config - template: - dest: /etc/fail2ban/jail.local - src: jail.local.j2 - notify: Restart fail2ban - -- name: flush handlers - meta: flush_handlers - -- name: Ensure fail2ban running even if no config change - service: - name: fail2ban - state: started - enabled: true +- import_tasks: install.yml +- import_tasks: configure.yml diff --git 
a/ansible/roles/filebeat/defaults/main.yml b/ansible/roles/filebeat/defaults/main.yml
index 4b4220a69..bdd02a2b7 100644
--- a/ansible/roles/filebeat/defaults/main.yml
+++ b/ansible/roles/filebeat/defaults/main.yml
@@ -1,6 +1,8 @@
 ---
 #filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template
+filebeat_debug: false
+
+# Note all the below can only be set/changed using the install.yml task file:
 filebeat_podman_user: "{{ ansible_user }}" # User that runs the filebeat container
 filebeat_version: 7.12.1 # latest usable with opensearch - see https://opensearch.org/docs/2.4/tools/index/#compatibility-matrix-for-beats
-filebeat_debug: false
diff --git a/ansible/roles/podman/tasks/config.yml b/ansible/roles/podman/tasks/configure.yml
similarity index 100%
rename from ansible/roles/podman/tasks/config.yml
rename to ansible/roles/podman/tasks/configure.yml
diff --git a/ansible/roles/podman/tasks/prereqs.yml b/ansible/roles/podman/tasks/install.yml
similarity index 100%
rename from ansible/roles/podman/tasks/prereqs.yml
rename to ansible/roles/podman/tasks/install.yml
diff --git a/ansible/roles/podman/tasks/main.yml b/ansible/roles/podman/tasks/main.yml
new file mode 100644
index 000000000..2b65e84b4
--- /dev/null
+++ b/ansible/roles/podman/tasks/main.yml
@@ -0,0 +1,2 @@
+- import_tasks: install.yml
+- import_tasks: configure.yml
diff --git a/ansible/roles/proxy/README.md b/ansible/roles/proxy/README.md
index 6d51fd9d4..19e947c31 100644
--- a/ansible/roles/proxy/README.md
+++ b/ansible/roles/proxy/README.md
@@ -4,8 +4,19 @@ Define http/s proxy configuration.

 ## Role variables

-- `proxy_http_proxy`: Required. Address of http proxy. E.g. "http://10.1.0.28:3128" for a Squid proxy on default port.
-- `proxy_https_proxy`: Optional. Address of https proxy. Default is `{{ proxy_http_proxy }}`.
-- `proxy_no_proxy_extra`: Optional. List of additional addresses not to proxy. Will be combined with default list which includes `inventory_hostname` (for hostnames) and `ansible_host` (for host IPs) for all Ansible hosts.
-- `proxy_dnf`: Optional bool. Whether to configure yum/dnf proxying through `proxy_http_proxy`. Default `true`.
-- `proxy_systemd`: Optional bool. Whether to give processes started by systemd the above http, https and no_proxy configuration. **NB** Running services will need restarting if this is changed. Default `true`.
+- `proxy_http_proxy`: Required str. Address of http proxy, e.g. `http://squid.mysite.org:3128`.
+  **NB:** If the `squid` group is enabled, this defaults to the address of the
+  first host in that group and the configured port. See `environments/common/inventory/group_vars/all/proxy.yml`
+  for other convenience variables to configure this.
+- `proxy_https_proxy`: Optional str. Address of https proxy. Default is `{{ proxy_http_proxy }}`.
+- `proxy_no_proxy_extras`: Optional list. Additional addresses not to proxy. Will
+  be combined with the default list, which includes `inventory_hostname` (for hostnames)
+  and `ansible_host` (for host IPs) for all Ansible hosts.
+- `proxy_dnf`: Optional bool. Whether to configure yum/dnf proxying through `proxy_http_proxy`.
+  Default `true`.
+- `proxy_systemd`: Optional bool. Whether to give processes started by systemd
+  the above http, https and no_proxy configuration. **NB** Running services will
+  need restarting if this is changed. Default `true`.
+- `proxy_remove`: Optional bool. Whether to remove the proxy configuration at
+  the end of the `site.yml` playbook (this is actually a variable on the play,
+  not a role variable). Default `false`.
diff --git a/ansible/roles/proxy/defaults/main.yml b/ansible/roles/proxy/defaults/main.yml
index fd2b079ec..cece7367c 100644
--- a/ansible/roles/proxy/defaults/main.yml
+++ b/ansible/roles/proxy/defaults/main.yml
@@ -5,3 +5,5 @@ proxy_no_proxy_extras: []
 proxy_no_proxy: "{{ (proxy_no_proxy_defaults + proxy_no_proxy_extras) | unique | sort | join(',') }}"
 proxy_dnf: true
 proxy_systemd: true
+proxy_state: present
+# proxy_remove: false
diff --git a/ansible/roles/proxy/tasks/main.yml b/ansible/roles/proxy/tasks/main.yml
index 70a7eca67..c3637fc21 100644
--- a/ansible/roles/proxy/tasks/main.yml
+++ b/ansible/roles/proxy/tasks/main.yml
@@ -1,3 +1,9 @@
+- name: Validate http_proxy definition
+  ansible.builtin.assert:
+    that: proxy_http_proxy != '' # this is default if squid not active
+    fail_msg: >-
+      Variable proxy_http_proxy cannot be the empty string for hosts in the
+      proxy group. See environments/common/inventory/group_vars/all/proxy.yml.
 - name: Define configuration in /etc/environment
   tags: proxy
   lineinfile:
@@ -6,7 +12,7 @@
     owner: root
     group: root
     mode: o=rw,go=r
-    state: present
+    state: "{{ proxy_state }}"
     regexp: "{{ item.key }}=.*"
     line: "{{ item.key }}={{ item.value }}"
   loop:
@@ -24,6 +30,7 @@
     option: "proxy"
     value: "{{ proxy_http_proxy }}"
     no_extra_spaces: true
+    state: "{{ proxy_state }}"
     owner: root
     group: root
     mode: o=rw,go=r
@@ -48,6 +55,7 @@
       "https_proxy={{ proxy_http_proxy }}"
       "no_proxy={{ proxy_no_proxy }}"
     no_extra_spaces: true
+    state: "{{ proxy_state }}"
     owner: root
     group: root
     mode: ug=rw,o=r
diff --git a/ansible/roles/rebuild/tasks/configure.yml b/ansible/roles/rebuild/tasks/configure.yml
new file mode 100644
index 000000000..78a3b7b55
--- /dev/null
+++ b/ansible/roles/rebuild/tasks/configure.yml
@@ -0,0 +1,17 @@
+---
+
+- name: Create /etc/openstack
+  file:
+    path: /etc/openstack
+    state: directory
+    owner: slurm
+    group: root
+    mode: u=rX,g=rwX
+
+- name: Copy out clouds.yaml
+  copy:
+    src: "{{ rebuild_clouds_path }}"
+    dest: /etc/openstack/clouds.yaml
+    owner: slurm
+    group: root
+    mode: u=r,g=rw
diff --git a/ansible/roles/rebuild/tasks/install.yml b/ansible/roles/rebuild/tasks/install.yml
new file mode 100644
index 000000000..1152426e6
--- /dev/null
+++ b/ansible/roles/rebuild/tasks/install.yml
@@ -0,0 +1,3 @@
+- name: Setup slurm tools
+  include_role:
+    name: slurm_tools
diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml
index 5612ab515..79d326ceb 100644
--- a/ansible/roles/rebuild/tasks/main.yml
+++ b/ansible/roles/rebuild/tasks/main.yml
@@ -1,21 +1,4 @@
 ---
-- name: Create /etc/openstack
-  file:
-    path: /etc/openstack
-    state: directory
-    owner: slurm
-    group: root
-    mode: u=rX,g=rwX
-
-- name: Copy out clouds.yaml
-  copy:
-    src: "{{ rebuild_clouds_path }}"
-    dest: /etc/openstack/clouds.yaml
-    owner: slurm
-    group: root
-    mode: u=r,g=rw
-
-- name: Setup slurm tools
-  include_role:
-    name: slurm_tools
+- include_tasks: install.yml
+- include_tasks: configure.yml
diff --git a/ansible/roles/slurm_exporter/tasks/configure.yml b/ansible/roles/slurm_exporter/tasks/configure.yml
new file mode 100644
index 000000000..e511be02b
--- /dev/null
+++ b/ansible/roles/slurm_exporter/tasks/configure.yml
@@ -0,0 +1,7 @@
+- name: Ensure slurm exporter state
+  systemd:
+    name: prometheus-slurm-exporter
+    state: "{{ slurm_exporter_state }}"
+    enabled: true
+  when:
+    - not ansible_check_mode
diff --git
a/ansible/roles/slurm_exporter/tasks/install.yml b/ansible/roles/slurm_exporter/tasks/install.yml index 49ee57fef..cba7aa95b 100644 --- a/ansible/roles/slurm_exporter/tasks/install.yml +++ b/ansible/roles/slurm_exporter/tasks/install.yml @@ -6,10 +6,3 @@ - meta: flush_handlers -- name: Ensure slurm exporter state - systemd: - name: prometheus-slurm-exporter - state: "{{ slurm_exporter_state }}" - enabled: true - when: - - not ansible_check_mode diff --git a/ansible/roles/slurm_exporter/tasks/main.yml b/ansible/roles/slurm_exporter/tasks/main.yml index 52b260f07..0171113a1 100644 --- a/ansible/roles/slurm_exporter/tasks/main.yml +++ b/ansible/roles/slurm_exporter/tasks/main.yml @@ -1,2 +1,3 @@ --- - import_tasks: install.yml +- import_tasks: configure.yml diff --git a/ansible/roles/slurm_stats/tasks/configure.yml b/ansible/roles/slurm_stats/tasks/configure.yml new file mode 100644 index 000000000..6bd87b276 --- /dev/null +++ b/ansible/roles/slurm_stats/tasks/configure.yml @@ -0,0 +1,30 @@ +--- + +- name: Create a directory to house the log files + file: + state: directory + path: /var/log/slurm-stats + become: true + +- name: Create cron job + cron: + name: Generate slurm stats + minute: "*/5" + user: root + # NOTE: lasttimestamp is stored at /root/lasttimestamp + job: "TZ=UTC /opt/slurm-tools/bin/slurm-stats >> /var/log/slurm-stats/finished_jobs.json" + cron_file: slurm-stats + become: true + +- name: Setup log rotate + copy: + content: | + # WARNING: This file is managed by ansible, do not modify. + /var/log/slurm-stats/finished_jobs.json { + {{ slurm_stats_log_rotate_content_frequency }} + rotate {{ slurm_stats_log_rotate_content_rotate }} + compress + delaycompress + } + dest: /etc/logrotate.d/slurm-stats + become: true diff --git a/ansible/roles/slurm_stats/tasks/install.yml b/ansible/roles/slurm_stats/tasks/install.yml new file mode 100644 index 000000000..748272eb6 --- /dev/null +++ b/ansible/roles/slurm_stats/tasks/install.yml @@ -0,0 +1,5 @@ +--- + +- name: Setup slurm tools + include_role: + name: slurm_tools diff --git a/ansible/roles/slurm_stats/tasks/main.yml b/ansible/roles/slurm_stats/tasks/main.yml index 6f02405c6..79d326ceb 100644 --- a/ansible/roles/slurm_stats/tasks/main.yml +++ b/ansible/roles/slurm_stats/tasks/main.yml @@ -1,34 +1,4 @@ --- -- name: Setup slurm tools - include_role: - name: slurm_tools - -- name: Create a directory to house the log files - file: - state: directory - path: /var/log/slurm-stats - become: true - -- name: Create cron job - cron: - name: Generate slurm stats - minute: "*/5" - user: root - # NOTE: lasttimestamp is stored at /root/lasttimestamp - job: "TZ=UTC /opt/slurm-tools/bin/slurm-stats >> /var/log/slurm-stats/finished_jobs.json" - cron_file: slurm-stats - become: true - -- name: Setup log rotate - copy: - content: | - # WARNING: This file is managed by ansible, do not modify. 
-      /var/log/slurm-stats/finished_jobs.json {
-      {{ slurm_stats_log_rotate_content_frequency }}
-      rotate {{ slurm_stats_log_rotate_content_rotate }}
-      compress
-      delaycompress
-      }
-    dest: /etc/logrotate.d/slurm-stats
-  become: true
+- include_tasks: install.yml
+- include_tasks: configure.yml
diff --git a/ansible/slurm.yml b/ansible/slurm.yml
index 1583f97ba..fd3424023 100644
--- a/ansible/slurm.yml
+++ b/ansible/slurm.yml
@@ -16,8 +16,9 @@
       - rebuild
       - openhpc
   tasks:
-    - import_role:
+    - include_role:
         name: rebuild
+        tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}"

 - name: Set locked memory limits on user-facing nodes
   hosts:
diff --git a/docs/experimental/isolated-clusters.md b/docs/experimental/isolated-clusters.md
new file mode 100644
index 000000000..304b625d2
--- /dev/null
+++ b/docs/experimental/isolated-clusters.md
@@ -0,0 +1,164 @@
+# Isolated Clusters
+
+Full functionality of the appliance requires that there is outbound internet
+access from all nodes, possibly via a [proxy](../../ansible/roles/proxy/).
+
+However, many features (as defined by Ansible inventory groups/roles) will work
+if the cluster network(s) provide no outbound access. Currently this includes
+all "default" features, i.e. roles/groups which are enabled either in the
+`common` environment or in the `environments/$ENV/inventory/groups` file
+created by cookiecutter for a new environment.
+
+The full list of features and whether they are functional on such an "isolated"
+network is shown in the table below. Note that:
+
+1. The `hpl` test from the `ansible/adhoc/hpctests.yml` playbook is not
+   functional and must be skipped using:
+
+   ```shell
+   ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo
+   ```
+
+2. Using [EESSI](https://www.eessi.io/docs/) necessarily requires outbound
+   network access for the CernVM File System. However, this can be provided
+   via an authenticated proxy. While the proxy configuration on the cluster node
+   is readable by all users, this proxy could be limited via ACLs to only provide
+   access to EESSI's CVMFS Stratum 1 servers.
+
+## Support by feature for isolated networks
+
+See above for the definition of "default" features. In the "Isolated?" column:
+- "Y": Feature works without outbound internet access.
+- "N": Known not to work.
+- "?": Not investigated at present.
+
+| Inventory group/role   | Default? | Isolated? |
+| ---------------------- | -------- | --------- |
+| alertmanager | Y | Y |
+| ansible_init | Y | Y |
+| basic_users | Y | Y |
+| block_devices | Y | N (deprecated) |
+| cacerts | - | Y |
+| chrony | - | Y |
+| compute_init | - | Y |
+| cuda | - | ? |
+| eessi | Y | Y - see above |
+| etc_hosts | Y | Y |
+| extra_packages | - | N |
+| fail2ban | Y | Y |
+| filebeat | Y | Y |
+| firewalld | Y | Y |
+| freeipa_client | - | Y - image build required |
+| gateway | n/a | n/a - build only |
+| grafana | Y | Y |
+| hpctests | Y | Y - except hpl-solo, see above |
+| k3s_agent | - | ? |
+| k3s_server | - | ? |
+| k9s | - | ? |
+| lustre | - | ? |
+| manila | Y | Y |
+| mysql | Y | Y |
+| nfs | Y | Y |
+| nhc | Y | Y |
+| node_exporter | Y | Y |
+| openhpc | Y | Y |
+| openondemand | Y | Y |
+| openondemand_desktop | Y | Y |
+| openondemand_jupyter | Y | Y |
+| opensearch | Y | Y |
+| podman | Y | Y |
+| persist_hostkeys | Y | Y |
+| prometheus | Y | Y |
+| proxy | - | Y |
+| resolv_conf | - | ? |
+| slurm_exporter | Y | Y |
+| slurm_stats | Y | Y |
+| squid | - | ? |
+| sshd | - | ? |
+| sssd | - | ? |
+| systemd | Y | Y |
+| tuned | - | Y |
+| update | - | N |
+
+## Image build
+A site image build may be required, either for features using packages not
+present in StackHPC images (e.g. `freeipa_client`) or to [add additional packages](../operations.md#adding-additional-packages).
+Clearly in this case the build VM does require outbound internet access. For an
+"isolated" environment, this could be achieved by [configuring image build](../image-build.md)
+to use a different network from the cluster. Alternatively, if an authenticated
+proxy is available the image build can be configured to use that, e.g.:
+
+```hcl
+# environments/$ENV/builder.pkrvars.hcl:
+...
+inventory_groups = 'proxy,freeipa_client'
+```
+
+```yaml
+# environments/$ENV/group_vars/builder/overrides.yml:
+proxy_basic_user: someuser
+proxy_basic_password: "{{ vault_proxy_basic_password }}"
+proxy_http_address: squid.mysite.org
+```
+
+```yaml
+# environments/$ENV/group_vars/builder/vault_overrides.yml:
+# NB: vault-encrypt this file
+vault_proxy_basic_password: 'super-secret-password'
+```
+
+See [ansible/roles/proxy/README.md](../../ansible/roles/proxy/README.md) and
+the convenience variables at
+[environments/common/inventory/group_vars/all/proxy.yml](../../environments/common/inventory/group_vars/all/proxy.yml).
+
+By default, the proxy configuration will be removed at the end of the build and
+hence will not be present in the image.
+
+## Network considerations
+
+Even when outbound internet access is not required, nodes do require some
+outbound access, as well as connectivity inbound from the deploy host and
+inbound connectivity for users. This section documents the minimal connectivity
+required, in the form of the minimally-permissive security group rules. Often
+default security groups are less restrictive than these.
+
+Assuming nodes and the deploy host have a security group `isolated` applied,
+the following rules are required:
+
+    # allow outbound DNS
+    ALLOW IPv4 53/tcp to 0.0.0.0/0
+    ALLOW IPv4 53/udp to 0.0.0.0/0
+
+    # allow everything within the cluster:
+    ALLOW IPv4 from isolated
+    ALLOW IPv4 to isolated
+
+    # allow hosts to reach metadata server (e.g. for cloud-init keys):
+    ALLOW IPv4 80/tcp to 169.254.169.254/32
+
+    # optionally: allow hosts to reach squid proxy for EESSI:
+    ALLOW IPv4 3128/tcp to <squid proxy address>
+
+Note that name resolution happens on the hosts, not on the proxy, hence DNS is
+required for nodes even with a proxy.
+
+For nodes running OpenOndemand, inbound ssh and https are also required
+(e.g. in a security group called `isolated-ssh-https`):
+
+    ALLOW IPv4 443/tcp from 0.0.0.0/0
+    ALLOW IPv4 22/tcp from 0.0.0.0/0
+
+If non-default security groups are required, then the OpenTofu variables
+`login_security_groups` and `nonlogin_security_groups` can be used to set
+these, e.g.:
+
+```terraform
+# environments/site/tofu/cluster.auto.tfvars:
+login_security_groups = [
+    "isolated", # allow all in-cluster services
+    "isolated-ssh-https", # access via ssh and ondemand
+]
+nonlogin_security_groups = [
+    "isolated"
+]
+```
diff --git a/docs/networks.md b/docs/networks.md
index bd10c380d..69b7ecec8 100644
--- a/docs/networks.md
+++ b/docs/networks.md
@@ -6,9 +6,10 @@ subnets or associated infrastructure such as routers. The requirements are that:
 2. The first network defined spans all nodes, referred to as the "access network".
 3. Only one subnet per network is attached to nodes.
 4. At least one network on each node provides outbound internet access (either
-directly, or via a proxy).
+   directly, or via a proxy). In some cases this can be relaxed - see
+   [docs/experimental/isolated-clusters.md](./experimental/isolated-clusters.md).

-Addresses on the "access network" used as the `ansible_host` IPs.
+Addresses on the "access network" are used as the `ansible_host` IPs.
 It is recommended that the deploy host either has a direct connection to the
 "access network" or jumps through a host on it which is not part of the appliance.
diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
index 16c438b0a..dec31368a 100644
--- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-250624-0854-75099868",
-        "RL9": "openhpc-RL9-250624-0854-75099868"
+        "RL8": "openhpc-RL8-250704-1445-ff88ca4e",
+        "RL9": "openhpc-RL9-250704-1445-ff88ca4e"
     }
 }
diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml
index 2df138072..f2998f4ab 100644
--- a/environments/common/inventory/group_vars/all/openondemand.yml
+++ b/environments/common/inventory/group_vars/all/openondemand.yml
@@ -82,11 +82,14 @@ openondemand_clusters_grafana:
       host: "host"
       jobid: "jobid"

-ood_install_apps_defaults:
+# define installation of openondemand apps (see openondemand_apps_* below for configuration):
+openondemand_install_apps_default:
   jupyter:
     repo: https://github.com/OSC/bc_example_jupyter.git
     version: master # defaults (optional)
-ood_install_apps: "{{ ood_install_apps_defaults if openondemand_jupyter_partition | default(none) else {} }}"
+openondemand_install_apps_extra: {} # mapping, values as for ansible.builtin.git: repo (required)/dest/version(default main)/umask
+# osc.ood role var (NB only active when not in configure):
+ood_install_apps: "{{ (openondemand_install_apps_default if openondemand_jupyter_partition | default(none) else {}) | combine(openondemand_install_apps_extra) }}"

 # https://github.com/OSC/ondemand/tree/master/apps/bc_desktop
 # also https://osc.github.io/ood-documentation/latest/enable-desktops/custom-job-submission.html#enable-desktops-custom-job-submission
diff --git a/environments/common/inventory/group_vars/all/proxy.yml b/environments/common/inventory/group_vars/all/proxy.yml
index d606ee1d9..f4914795e 100644
--- a/environments/common/inventory/group_vars/all/proxy.yml
+++ b/environments/common/inventory/group_vars/all/proxy.yml
@@ -1,2 +1,31 @@
-# default proxy address to first squid api address port 3128 if squid group non-empty, else empty string to avoid breaking hostvars
-proxy_http_proxy: "{{ 'http://' + hostvars[groups['squid'].0].api_address + ':' + (squid_http_port | string) if groups['squid'] else '' }}"
+# If squid group is non-empty, default the proxy address to the hostname of
+# the first squid host, port 3128. Else empty string to avoid breaking hostvars

+# Any of these defaults may be overridden in e.g. environments/site/group_vars/all/proxy.yml if required.
+
+# override if basic_auth is required:
+proxy_basic_user: ''
+proxy_basic_password: ''
+
+# hostname/address of proxy - override if not using appliance squid group/role:
+proxy_http_address: "{{ (hostvars[groups['squid'].0 | default(None)] | default({})).api_address | default('') }}"
+
+# port of proxy - override if not using appliance squid group/role:
+proxy_http_port: "{{ squid_http_port }}"
+
+# whether to remove the proxy configuration at the end of Ansible runs:
+# (useful for image build where this should not be captured in the image)
+proxy_remove: false
+# NB for the `builder` group this defaults to true
+
+# full http proxy string - override if the above don't provide enough control:
+proxy_http_proxy: >-
+  {%- if groups['squid'] | length > 0 -%}
+  http://
+  {%- if proxy_basic_password -%}
+  {{ proxy_basic_user }}:{{ proxy_basic_password }}@
+  {%- endif -%}
+  {{ proxy_http_address }}:{{ proxy_http_port }}
+  {%- else -%}
+
+  {%- endif -%}
diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml
index dae4edd9a..dc28e44f5 100644
--- a/environments/common/inventory/group_vars/builder/defaults.yml
+++ b/environments/common/inventory/group_vars/builder/defaults.yml
@@ -24,4 +24,6 @@ tuned_started: false
 tuned_enabled: false
 sssd_started: false
 sssd_enabled: false
+slurm_exporter_state: stopped
 appliances_mode: build
+proxy_remove: true
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index 5590c8bb6..b7a7035e6 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -22,6 +22,9 @@ control
 [slurm_stats:children]
 control

+[slurm_exporter:children]
+control
+
 [filebeat:children]
 slurm_stats

@@ -56,6 +59,9 @@ compute
 # Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md
 cluster

+[freeipa_client]
+# Hosts to be a FreeIPA client. See ansible/roles/freeipa/README.md
+
 [cuda]
 # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
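
The pattern running through this changeset is uniform: each role gains an `install.yml` (package/download steps, run during image build) and a `configure.yml` (runtime configuration), with `main.yml` importing both, and plays selecting an entry point from `appliances_mode`. A minimal sketch of adopting the pattern for a new role; the role name `example` and the play below are illustrative only, not part of this changeset (`appliances_mode` comes from the appliance's group vars, e.g. it is set to `build` for the `builder` group above):

```yaml
# ansible/roles/example/tasks/main.yml
# Used during image build, where both installation and configuration run:
---
- import_tasks: install.yml
- import_tasks: configure.yml
```

```yaml
# Play dispatching on appliances_mode: on a live cluster
# (appliances_mode == 'configure') only configure.yml runs, so no
# package installation is attempted, e.g. on isolated networks.
- hosts: example
  become: yes
  tasks:
    - include_role:
        name: example
        tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}"
```

Note that `tasks_from` is templated from an inventory variable only known per-host at runtime, which is presumably why the plays above switch from the static `import_role` to the dynamic `include_role`.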