From 5a883fea0c8261880a073dd8330306c22055fba9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 13:11:01 +0000 Subject: [PATCH 1/9] document which roles require an image build for compute_init --- ansible/roles/compute_init/README.md | 122 ++++++++++++++------------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index d016c7168..17d9fe986 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -27,63 +27,65 @@ To enable this: ## Supported appliance functionalities -The string "compute" must be present in the `compute_init_enable` flag to enable -this functionality. The table below shows which other appliance functionalities -are currently supported - use the name in the role column to enable these. - -| Playbook | Role (or functionality) | Support | -| -------------------------|-------------------------|-----------------| -| hooks/pre.yml | ? 
| None at present | -| validate.yml | n/a | Not relevant during boot | -| bootstrap.yml | (wait for ansible-init) | Not relevant during boot | -| bootstrap.yml | resolv_conf | Fully supported | -| bootstrap.yml | etc_hosts | Fully supported | -| bootstrap.yml | proxy | None at present | -| bootstrap.yml | (/etc permissions) | None required - use image build | -| bootstrap.yml | (ssh /home fix) | None required - use image build | -| bootstrap.yml | (system users) | None required - use image build | -| bootstrap.yml | systemd | None required - use image build | -| bootstrap.yml | selinux | None required - use image build | -| bootstrap.yml | sshd | None at present | -| bootstrap.yml | dnf_repos | None at present (requirement TBD) | -| bootstrap.yml | squid | Not relevant for compute nodes | -| bootstrap.yml | tuned | None | -| bootstrap.yml | freeipa_server | Not relevant for compute nodes | -| bootstrap.yml | cockpit | None required - use image build | -| bootstrap.yml | firewalld | Not relevant for compute nodes | -| bootstrap.yml | fail2ban | Not relevant for compute nodes | -| bootstrap.yml | podman | Not relevant for compute nodes | -| bootstrap.yml | update | Not relevant during boot | -| bootstrap.yml | reboot | Not relevant for compute nodes | -| bootstrap.yml | ofed | Not relevant during boot | -| bootstrap.yml | ansible_init (install) | Not relevant during boot | -| bootstrap.yml | k3s (install) | Not relevant during boot | -| hooks/post-bootstrap.yml | ? 
| None at present | -| iam.yml | freeipa_client | None at present [1] | -| iam.yml | freeipa_server | Not relevant for compute nodes | -| iam.yml | sssd | None at present | -| filesystems.yml | block_devices | None required - role deprecated | -| filesystems.yml | nfs | All client functionality | -| filesystems.yml | manila | All functionality | -| filesystems.yml | lustre | None at present | -| extras.yml | basic_users | All functionality [2] | -| extras.yml | eessi | All functionality [3] | -| extras.yml | cuda | None required - use image build [4] | -| extras.yml | persist_hostkeys | Not expected to be required for compute nodes | -| extras.yml | compute_init (export) | Not relevant for compute nodes | -| extras.yml | k9s (install) | Not relevant during boot | -| extras.yml | extra_packages | None at present. Would require dnf_repos | -| slurm.yml | mysql | Not relevant for compute nodes | -| slurm.yml | rebuild | Not relevant for compute nodes | -| slurm.yml | openhpc [5] | All slurmd-related functionality | -| slurm.yml | (set memory limits) | None at present | -| slurm.yml | (block ssh) | None at present | -| portal.yml | (openondemand server) | Not relevant for compute nodes | -| portal.yml | (openondemand vnc desktop) | None required - use image build | -| portal.yml | (openondemand jupyter server) | None required - use image build | -| monitoring.yml | (all monitoring) | None at present [6] | -| disable-repos.yml | dnf_repos | None at present (requirement TBD) | -| hooks/post.yml | ? | None at present | +In the table below, if a role is marked as supported then its functionality +can be enabled during boot by adding the role name to the `compute_init_enable` +property described above. If a role is marked as requiring a custom image then +it also requires an image build with the role name added to the +[Packer inventory_groups variable](../../../docs/image-build.md). + +| Playbook | Role (or functionality) | Support | Custom image reqd.? 
| +| -------------------------|-------------------------|---------------------------------|---------------------| +| hooks/pre.yml | ? | None at present | n/a | +| validate.yml | n/a | Not relevant during boot | n/a | +| bootstrap.yml | (wait for ansible-init) | Not relevant during boot | n/a | +| bootstrap.yml | resolv_conf | Fully supported | No | +| bootstrap.yml | etc_hosts | Fully supported | No | +| bootstrap.yml | proxy | None at present | No | +| bootstrap.yml | (/etc permissions) | None required - use image build | No | +| bootstrap.yml | (ssh /home fix) | None required - use image build | No | +| bootstrap.yml | (system users) | None required - use image build | No | +| bootstrap.yml | systemd | None required - use image build | No | +| bootstrap.yml | selinux | None required - use image build | Maybe [7] | +| bootstrap.yml | sshd | None at present | No | +| bootstrap.yml | dnf_repos | None at present [8] | - | +| bootstrap.yml | squid | Not relevant for compute nodes | n/a | +| bootstrap.yml | tuned | None | - | +| bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | +| bootstrap.yml | cockpit | None required - use image build | No | +| bootstrap.yml | firewalld | Not relevant for compute nodes | n/a | +| bootstrap.yml | fail2ban | Not relevant for compute nodes | n/a | +| bootstrap.yml | podman | Not relevant for compute nodes | n/a | +| bootstrap.yml | update | Not relevant during boot | n/a | +| bootstrap.yml | reboot | Not relevant for compute nodes | n/a | +| bootstrap.yml | ofed | Not relevant during boot | Yes | +| bootstrap.yml | ansible_init (install) | Not relevant during boot | n/a | +| bootstrap.yml | k3s (install) | Not relevant during boot | n/a | +| hooks/post-bootstrap.yml | ? 
| None at present | n/a | +| iam.yml | freeipa_client | None at present [1] | Yes | +| iam.yml | freeipa_server | Not relevant for compute nodes | n/a | +| iam.yml | sssd | None at present | No | +| filesystems.yml | block_devices | None required - role deprecated | n/a | +| filesystems.yml | nfs | All client functionality | No | +| filesystems.yml | manila | All functionality | No [10] | +| filesystems.yml | lustre | None at present | Yes | +| extras.yml | basic_users | All functionality [2] | No | +| extras.yml | eessi | All functionality [3] | No | +| extras.yml | cuda | None required - use image build | Yes [4] | +| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a | +| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a | +| extras.yml | k9s (install) | Not relevant during boot | n/a | +| extras.yml | extra_packages | None at present [9] | - | +| slurm.yml | mysql | Not relevant for compute nodes | n/a | +| slurm.yml | rebuild | Not relevant for compute nodes | n/a | +| slurm.yml | openhpc [5] | All slurmd functionality | No | +| slurm.yml | (set memory limits) | None at present | - | +| slurm.yml | (block ssh) | None at present | - | +| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a | +| portal.yml | (openondemand vnc desktop) | None required - use image build | No | +| portal.yml | (openondemand jupyter server) | None required - use image build | No | +| monitoring.yml | (all monitoring) | None at present [6] | - | +| disable-repos.yml | dnf_repos | None at present [8] | - | +| hooks/post.yml | ? | None at present | - | Notes: @@ -95,7 +97,11 @@ Notes: and will start during boot. 5. `openhpc` does not need to be added to `compute_init_enable`, this is automatically enabled by adding `compute`. -5. Only node-exporter tasks are relevant, and will be done via k3s in a future release. +6. Only node-exporter tasks are relevant, and will be done via k3s in a future release. +7. 
`selinux` is set to disabled in StackHPC images. +8. Requirement TBD +9. Would require dnf_repos +10. Assuming default cephfs version ## Approach From 3024f70e2e01f679a6341b14685888a24541d81a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 13:20:35 +0000 Subject: [PATCH 2/9] fix compute-init docs for node-exporter --- ansible/roles/compute_init/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 17d9fe986..12cb8789f 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -83,7 +83,8 @@ it also requires an image build with the role name added to the | portal.yml | (openondemand server) | Not relevant for compute nodes | n/a | | portal.yml | (openondemand vnc desktop) | None required - use image build | No | | portal.yml | (openondemand jupyter server) | None required - use image build | No | -| monitoring.yml | (all monitoring) | None at present [6] | - | +| monitoring.yml | node_exporter | None required - use image build | No | +| monitoring.yml | (other monitoring) | Not relevant for compute nodes | - | | disable-repos.yml | dnf_repos | None at present [8] | - | | hooks/post.yml | ? 
| None at present | - | From aa880f3c87315ff1ab1bac5f9395f005d7f0cd96 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 13:26:21 +0000 Subject: [PATCH 3/9] reorder compute-init docs notes --- ansible/roles/compute_init/README.md | 40 +++++++++++++--------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 12cb8789f..e8171ea3e 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -45,9 +45,9 @@ it also requires an image build with the role name added to the | bootstrap.yml | (ssh /home fix) | None required - use image build | No | | bootstrap.yml | (system users) | None required - use image build | No | | bootstrap.yml | systemd | None required - use image build | No | -| bootstrap.yml | selinux | None required - use image build | Maybe [7] | +| bootstrap.yml | selinux | None required - use image build | Maybe [1] | | bootstrap.yml | sshd | None at present | No | -| bootstrap.yml | dnf_repos | None at present [8] | - | +| bootstrap.yml | dnf_repos | None at present [2] | - | | bootstrap.yml | squid | Not relevant for compute nodes | n/a | | bootstrap.yml | tuned | None | - | | bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | @@ -61,23 +61,23 @@ it also requires an image build with the role name added to the | bootstrap.yml | ansible_init (install) | Not relevant during boot | n/a | | bootstrap.yml | k3s (install) | Not relevant during boot | n/a | | hooks/post-bootstrap.yml | ? 
| None at present | n/a | -| iam.yml | freeipa_client | None at present [1] | Yes | +| iam.yml | freeipa_client | None at present [3] | Yes | | iam.yml | freeipa_server | Not relevant for compute nodes | n/a | | iam.yml | sssd | None at present | No | | filesystems.yml | block_devices | None required - role deprecated | n/a | | filesystems.yml | nfs | All client functionality | No | -| filesystems.yml | manila | All functionality | No [10] | +| filesystems.yml | manila | All functionality | No [4] | | filesystems.yml | lustre | None at present | Yes | -| extras.yml | basic_users | All functionality [2] | No | -| extras.yml | eessi | All functionality [3] | No | -| extras.yml | cuda | None required - use image build | Yes [4] | +| extras.yml | basic_users | All functionality [5] | No | +| extras.yml | eessi | All functionality [6] | No | +| extras.yml | cuda | None required - use image build | Yes [7] | | extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a | | extras.yml | compute_init (export) | Not relevant for compute nodes | n/a | | extras.yml | k9s (install) | Not relevant during boot | n/a | -| extras.yml | extra_packages | None at present [9] | - | +| extras.yml | extra_packages | None at present [8] | - | | slurm.yml | mysql | Not relevant for compute nodes | n/a | | slurm.yml | rebuild | Not relevant for compute nodes | n/a | -| slurm.yml | openhpc [5] | All slurmd functionality | No | +| slurm.yml | openhpc [9] | All slurmd functionality | No | | slurm.yml | (set memory limits) | None at present | - | | slurm.yml | (block ssh) | None at present | - | | portal.yml | (openondemand server) | Not relevant for compute nodes | n/a | @@ -85,25 +85,23 @@ it also requires an image build with the role name added to the | portal.yml | (openondemand jupyter server) | None required - use image build | No | | monitoring.yml | node_exporter | None required - use image build | No | | monitoring.yml | (other monitoring) | Not relevant for compute nodes | - 
| -| disable-repos.yml | dnf_repos | None at present [8] | - | +| disable-repos.yml | dnf_repos | None at present [2] | - | | hooks/post.yml | ? | None at present | - | Notes: -1. FreeIPA client functionality would be better provided using a client fork +1. `selinux` is set to disabled in StackHPC images. +2. Requirement for this functionality is TBD. +3. FreeIPA client functionality would be better provided using a client fork which uses pkinit keys rather than OTP to reenrol nodes. -2. Assumes home directory already exists on shared storage. -3. Assumes `cvmfs_config` is the same on control node and all compute nodes -4. If `cuda` role was run during build, the nvidia-persistenced is enabled +4. Assuming default Ceph client version. +5. Assumes home directory already exists on shared storage. +6. Assumes `cvmfs_config` is the same on control node and all compute nodes. +7. If the `cuda` role was run during build, the nvidia-persistenced service is enabled and will start during boot. -5. `openhpc` does not need to be added to `compute_init_enable`, this is +8. Would require `dnf_repos`. +9. `openhpc` does not need to be added to `compute_init_enable`; this is automatically enabled by adding `compute`. -6. Only node-exporter tasks are relevant, and will be done via k3s in a future release. -7. `selinux` is set to disabled in StackHPC images. -8. Requirement TBD -9. Would require dnf_repos -10. 
Assuming default cephfs version - ## Approach This works as follows: From 91497b6e6d200c2617f631f7f2cd200ba5e19507 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 13:36:37 +0000 Subject: [PATCH 4/9] add tuned into stackhpc image --- environments/.stackhpc/inventory/extra_groups | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 416d50566..7b15a4fb6 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -24,6 +24,10 @@ cluster login compute +[tuned:children] +# Install tuned into fat image +builder + [squid:children] # Install squid into fat image builder From 2650f0bb85d0eb3678d70319b1be6260fecd3598 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 15:34:32 +0000 Subject: [PATCH 5/9] match /mnt/cluster permissions before and after mount, so ansible-init reruns don't fail --- ansible/roles/compute_init/files/compute-init.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 430e2cf65..73bf65209 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -59,9 +59,9 @@ file: path: /mnt/cluster state: directory - owner: root + owner: slurm group: root - mode: u=rwX,go= # is sensitive + mode: u=rX,g=rwX,o= - name: Mount /mnt/cluster mount: From ba664f1125ea9e15664263805f1a445b5bc68e52 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 15:36:30 +0000 Subject: [PATCH 6/9] support tuned in ansible-init --- ansible/roles/compute_init/README.md | 2 +- ansible/roles/compute_init/files/compute-init.yml | 11 +++++++++++ ansible/roles/compute_init/tasks/install.yml | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/README.md 
b/ansible/roles/compute_init/README.md index e8171ea3e..70fa82229 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -49,7 +49,7 @@ it also requires an image build with the role name added to the | bootstrap.yml | sshd | None at present | No | | bootstrap.yml | dnf_repos | None at present [2] | - | | bootstrap.yml | squid | Not relevant for compute nodes | n/a | -| bootstrap.yml | tuned | None | - | +| bootstrap.yml | tuned | Fully supported | No | | bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | | bootstrap.yml | cockpit | None required - use image build | No | | bootstrap.yml | firewalld | Not relevant for compute nodes | n/a | diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 73bf65209..bbd0f029e 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -9,6 +9,7 @@ enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}" enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}" + enable_tuned: "{{ os_metadata.meta.tuned | default(false) | bool }}" enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}" enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}" enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}" @@ -17,6 +18,12 @@ # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] + tuned_profile_baremetal: hpc-compute + tuned_profile_vm: virtual-guest + tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}" + tuned_enabled: true + tuned_started: true + nfs_client_mnt_point: "/mnt" nfs_client_mnt_options: nfs_client_mnt_state: mounted @@ -125,6 +132,10 @@ mode: 0644 
when: enable_etc_hosts + - name: Configure tuned + include_tasks: tasks/tuned.yml + when: enable_tuned + # NFS client mount - name: If nfs-clients is present include_tasks: tasks/nfs-clients.yml diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index bbcbf133f..77cddf0a8 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -32,6 +32,8 @@ dest: files/NetworkManager-dns-none.conf - src: ../../basic_users/filter_plugins/filter_keys.py dest: filter_plugins/filter_keys.py + - src: ../../tuned/tasks/configure.yml + dest: tasks/tuned.yml - src: ../../stackhpc.nfs/tasks/nfs-clients.yml dest: tasks/nfs-clients.yml From 158fe85313ae0d06028a4de0a43e9cb8e768815d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 15:36:59 +0000 Subject: [PATCH 7/9] enable tuned for stackhpc rebuilds --- environments/.stackhpc/tofu/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index 91c3e178c..a84c3eefb 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -80,7 +80,7 @@ module "cluster" { standard: { # NB: can't call this default! 
nodes: ["compute-0", "compute-1"] flavor: var.other_node_flavor - compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"] + compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi", "tuned"] ignore_image_changes: true } # Example of how to add another partition: From e6614bcb7e2feb4ebeb81b81512b9116ae8ce6fb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 15:37:39 +0000 Subject: [PATCH 8/9] allow injecting ark creds even during site.yml, for development use --- ansible/bootstrap.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 1d818b7cc..8d879b7d9 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -126,7 +126,9 @@ ansible.builtin.assert: that: dnf_repos_password is undefined fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' - when: appliances_mode == 'configure' + when: + - appliances_mode == 'configure' + - not (dnf_repos_allow_insecure_creds | default(false)) # useful for development - hosts: squid tags: squid From 2e0ef3768f323e6fbf28354a400793e061ba4fce Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 11 Feb 2025 16:29:43 +0000 Subject: [PATCH 9/9] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 46c784e13..99bca2f54 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250130-1126-8f2a7703", - "RL9": "openhpc-RL9-250130-1127-8f2a7703" + "RL8": "openhpc-RL8-250211-1540-a0b4a57e", + "RL9": "openhpc-RL9-250211-1540-a0b4a57e" } }