From 30984da5e62fa8c3649f1f4ee060b0113fee44ed Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Mar 2025 16:12:25 +0000 Subject: [PATCH 01/15] slurm controlled rebuild docs & adhoc support --- ansible/adhoc/reboot_via_slurm.yml | 24 -- ansible/adhoc/rebuild-via-slurm.yml | 17 + ansible/roles/compute_init/README.md | 11 +- ansible/roles/rebuild/README.md | 51 ++- ansible/roles/rebuild/defaults/main.yml | 23 +- ansible/roles/rebuild/tasks/main.yml | 2 +- ansible/roles/rebuild/tasks/rebuild.yml | 11 + .../roles/rebuild/tasks/rebuild_partition.yml | 21 ++ docs/experimental/slurm-controlled-rebuild.md | 310 ++++++++++++++++++ 9 files changed, 425 insertions(+), 45 deletions(-) delete mode 100644 ansible/adhoc/reboot_via_slurm.yml create mode 100644 ansible/adhoc/rebuild-via-slurm.yml create mode 100644 ansible/roles/rebuild/tasks/rebuild.yml create mode 100644 ansible/roles/rebuild/tasks/rebuild_partition.yml create mode 100644 docs/experimental/slurm-controlled-rebuild.md diff --git a/ansible/adhoc/reboot_via_slurm.yml b/ansible/adhoc/reboot_via_slurm.yml deleted file mode 100644 index b5d5d0d0f..000000000 --- a/ansible/adhoc/reboot_via_slurm.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Reboot compute nodes via slurm. Nodes will be rebuilt if `image_id` in inventory is different to the currently-provisioned image. -# Example: -# ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml - -- hosts: login - run_once: true - become: yes - gather_facts: no - tasks: - - name: Submit a Slurm job to reboot compute nodes - ansible.builtin.shell: | - set -e - srun --reboot -N 2 uptime - become_user: root - register: slurm_result - failed_when: slurm_result.rc != 0 - - - name: Fetch Slurm controller logs if reboot fails - ansible.builtin.shell: | - journalctl -u slurmctld --since "10 minutes ago" | tail -n 50 - become_user: root - register: slurm_logs - when: slurm_result.rc != 0 - delegate_to: "{{ groups['control'] | first }}" \ No newline at end of file diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml new file mode 100644 index 000000000..4f7b5a576 --- /dev/null +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -0,0 +1,17 @@ +# Rebuild compute nodes via slurm. +# Nodes will be rebuilt if `image_id` in inventory is different to the +# currently-provisioned image. Otherwise they are rebooted. + +# Example: +# ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml + +# See docs/slurm-controlled-rebuild.md. + +- hosts: login + run_once: true + gather_facts: no + tasks: + - name: Run slurm-controlled rebuild + import_role: + name: rebuild + tasks_from: rebuild.yml diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index e64ea6ffb..81a62bade 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -1,12 +1,11 @@ # EXPERIMENTAL: compute_init -Experimental functionality to allow compute nodes to rejoin the cluster after -a reboot without running the `ansible/site.yml` playbook. +Allow compute nodes to rejoin the cluster after a reboot without running the +`ansible/site.yml` playbook. -**CAUTION:** The approach used here of exporting cluster secrets over NFS -is considered to be a security risk due to the potential for cluster users to -mount the share on a user-controlled machine by tunnelling through a login -node. This feature should not be enabled on production clusters at this time. 
+> [!NOTE]
+> This functionality is marked as experimental as it may be incomplete and the
+> required configuration may change with further development.
 
 To enable this:
 1. Add the `compute` group (or a subset) into the `compute_init` group.
diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md
index 314d7c94d..ddce17933 100644
--- a/ansible/roles/rebuild/README.md
+++ b/ansible/roles/rebuild/README.md
@@ -1,30 +1,55 @@
 rebuild
 =========
 
-Enables reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git to be run from control node.
+Enables the reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git
+to be run from the control node.
 
 Requirements
 ------------
 
-clouds.yaml file
+An OpenStack clouds.yaml file containing credentials for a cloud under the
+"openstack" key.
 
 Role Variables
 --------------
 
-- `openhpc_rebuild_clouds`: Directory. Path to clouds.yaml file.
+The below is only used by this role's `main.yml` task file, i.e. when running
+the `ansible/site.yml` or `ansible/slurm.yml` playbooks:
 
+- `rebuild_clouds_path`: Optional. Path to `clouds.yaml` file on the deploy
+  host, default `~/.config/openstack/clouds.yaml`.
 
-Example Playbook
-----------------
+The below are only used by this role's `rebuild.yml` task file, i.e. when
+running the `ansible/adhoc/rebuild-via-slurm.yml` playbook:
 
-    - hosts: control
-      become: yes
-      tasks:
-        - import_role:
-            name: rebuild
+- `rebuild_job_partitions`: Optional. Comma-separated list of names of rebuild
+  partitions defined in `openhpc_slurm_partitions`. Useful as an extra-var for
+  limiting rebuilds. Default `rebuild`.
 
-License
--------
+- `rebuild_job_name`: Optional. Name of rebuild jobs. Default is `rebuild-`
+  suffixed with the node name.
 
-Apache-2.0
+- `rebuild_job_command`: Optional. String giving command to run in job after
+  node has been rebuilt. Default is to sleep for 5 seconds. Note job output is
+  sent to `/dev/null` by default, as the root user running this has no shared
+  directory for job output.
+- `rebuild_job_reboot`: Bool, whether to add the `--reboot` flag to the job
+  to actually trigger a rebuild. Useful for e.g. testing priorities. Default
+  `true`.
+
+- `rebuild_job_options`: Optional. A string giving any other options to pass to
+  [sbatch](https://slurm.schedmd.com/sbatch.html). Default is empty string.
+
+- `rebuild_job_user`: Optional. The user to run the rebuild setup and job as.
+  Default `root`.
+
+- `rebuild_job_template`: Optional. The string to use to submit the job. See
+  [defaults/main.yml](defaults/main.yml).
+
+- `rebuild_job_hostlist`: String with a Slurm hostlist expression to restrict
+  a rebuild to only those nodes (e.g. `tux[1-3]` or `tux1,tux2`). If set,
+  `rebuild_job_partitions` must only define a single partition and that partition
+  must contain those nodes. Not for routine use, but may be useful to e.g.
+  reattempt a rebuild if this failed on specific nodes. Default is all nodes
+  in the relevant partition.
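+
+Example Usage
+-------------
+
+A minimal sketch only, assuming the default `rebuild` partition described above
+has been defined in `openhpc_slurm_partitions`: the variables above are normally
+overridden as extra-vars when running the adhoc playbook, e.g.:
+
+    # submit rebuild jobs for all nodes in the default "rebuild" partition:
+    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml
+
+    # submit jobs without the --reboot flag, e.g. to check scheduling behaviour only:
+    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e rebuild_job_reboot=false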
diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml index 06b237ef2..948283633 100644 --- a/ansible/roles/rebuild/defaults/main.yml +++ b/ansible/roles/rebuild/defaults/main.yml @@ -1,2 +1,23 @@ --- -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file + +rebuild_clouds_path: ~/.config/openstack/clouds.yaml + +rebuild_job_partitions: rebuild +rebuild_job_name: "rebuild-{{ item }}" # item is nodename +rebuild_job_command: 'sleep 5' +rebuild_job_reboot: true +rebuild_job_options: '' +rebuild_job_user: root +rebuild_job_template: >- + sbatch + --nodelist={{ item }} + {{ '--reboot' if rebuild_job_reboot | bool else '' }} + --job-name={{ rebuild_job_name }} + --nodes=1 + --exclusive + --partition={{ _rebuild_job_current_partition }} + --no-requeue + --output=/dev/null + --wrap="{{ rebuild_job_command }}" + {{ rebuild_job_options }} +#rebuild_job_hostlist: \ No newline at end of file diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index c677716c7..5612ab515 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -10,7 +10,7 @@ - name: Copy out clouds.yaml copy: - src: "{{ openhpc_rebuild_clouds }}" + src: "{{ rebuild_clouds_path }}" dest: /etc/openstack/clouds.yaml owner: slurm group: root diff --git a/ansible/roles/rebuild/tasks/rebuild.yml b/ansible/roles/rebuild/tasks/rebuild.yml new file mode 100644 index 000000000..466951f63 --- /dev/null +++ b/ansible/roles/rebuild/tasks/rebuild.yml @@ -0,0 +1,11 @@ +- name: Create rebuild jobs for partition + include_tasks: + file: rebuild_partition.yml + args: + apply: + become: yes + become_user: "{{ rebuild_job_user }}" + loop: "{{ rebuild_job_partitions | split(',') }}" + loop_control: + loop_var: _rebuild_job_current_partition + diff --git a/ansible/roles/rebuild/tasks/rebuild_partition.yml b/ansible/roles/rebuild/tasks/rebuild_partition.yml new file mode 100644 index 000000000..0ce4ee88a --- /dev/null +++ b/ansible/roles/rebuild/tasks/rebuild_partition.yml @@ -0,0 +1,21 @@ +- name: Get list of nodes in partition + ansible.builtin.command: + cmd: >- + sinfo + --Node + --Format=NodeList + --noheader + --partition={{ _rebuild_job_current_partition }} + register: _sinfo_partition + when: rebuild_job_hostlist is not defined + +- name: Expand rebuild_job_hostlist to host names + ansible.builtin.command: + cmd: "scontrol show hostnames {{ rebuild_job_hostlist }}" + register: _scontrol_hostnames + when: rebuild_job_hostlist is defined + +- name: Submit rebuild jobs + ansible.builtin.command: + cmd: "{{ rebuild_job_template }}" + loop: "{{ _scontrol_hostnames.stdout_lines | default(_sinfo_partition.stdout_lines) }}" diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md new file mode 100644 index 000000000..4320071da --- /dev/null +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -0,0 +1,310 @@ +# EXPERIMENTAL: Slurm Controlled Rebuild + +This page describes how to configure the appliance to enable reimaging of +Slurm nodes via submission of Slurm jobs, and how to use that functionality. +This provides a way to upgrade nodes with less impact than the normal approach. + +> [!NOTE] +> This functionality is marked as experimental as the required configuration +> or usage may change with further development. + +In summary, the way this functionality works is as follows: +1. 
The image reference(s) are manually updated in the OpenTofu configuration
+   in the normal way.
+2. `tofu apply` is run which rebuilds the login and control nodes to the new
+   image(s). The new image reference for compute nodes is ignored, but is
+   written into the hosts inventory file (and is therefore available as an
+   Ansible hostvar).
+3. The `site.yml` playbook is run which reconfigures the cluster as normal. At
+   this point the cluster is functional, but using a new image for the login
+   and control nodes and the old image for the compute nodes. This playbook
+   also:
+   - Writes cluster configuration to the control node, using the
+     [compute_init](../../ansible/roles/compute_init/README.md) role.
+   - Configures an application credential and helper programs on the control
+     node, using the [rebuild](../../ansible/roles/rebuild/README.md) role.
+4. An admin submits Slurm jobs, one for each node, to a special "rebuild"
+   partition using an Ansible playbook. Because this partition has higher
+   priority than the partitions normal users can use, these rebuild jobs become
+   the next job in the queue for every node (although any jobs currently
+   running will complete as normal).
+5. Because these rebuild jobs have the `--reboot` flag set, before launching them
+   the Slurm control node runs a [RebootProgram](https://slurm.schedmd.com/slurm.conf.html#OPT_RebootProgram)
+   which compares the current image for the node to the one in the cluster
+   configuration, and if it does not match, uses OpenStack to rebuild the
+   node to the desired (updated) image.
+   TODO: Describe the logic if they DO match
+6. After a rebuild, the compute node runs various Ansible tasks during boot,
+   controlled by the [compute_init](../../ansible/roles/compute_init/README.md)
+   role, to fully configure the node again. It retrieves the required cluster
+   configuration information from the control node via an NFS mount.
+7. Once the `slurmd` daemon starts on a compute node, the slurm controller
+   registers the node as having finished rebooting. It then launches the actual
+   job, which does not do anything.
+
+   # TODO: check that this is the LAST thing we do?
+
+
+
+TODO: note terraform parallel limits
+
+nova compute: 10 per nova controller, so either per HV or whole-cloud BM * (different in Caracel onwards). Could tune to 50-100 if properly set.
+
+
+## Prerequisites
+
+To enable a compute node to rejoin the cluster after a vrebuild, functionality
+must be built into the image. Before progressing you should check that all the
+functionality required for your cluster is currently supported by the
+`compute_init` role. Review that role's [README](../../ansible/roles/compute_init/README.md)
+against `environments/*/inventory/groups` files (and any similar files which
+define groups). Note that some functionality does not require support, e.g.
+because it does not run on compute nodes.
+
+## Configuration
+
+The configuration of this is complex and involves:
+- OpenTofu variables to stop tracking image changes on compute nodes
+- Definition of partition(s) to use for launching rebuild jobs
+- Configuration of the [rebuild](../../ansible/roles/rebuild/README.md) role
+  to enable the Slurm controller to rebuild compute nodes via OpenStack.
+- Configuration of the [compute_init](../../ansible/roles/compute_init/README.md)
+  role so that compute nodes rejoin the cluster after rebuilding - this is likely
+  to require a custom image build.
+
+1. Decide on which nodes rebuilding via Slurm should be enabled. These are
+   referred to as the "rebuildable" nodes below. Generally, this can be all
+   compute nodes.
+
+2. Configure OpenTofu not to manage image changes on rebuildable nodes: For each
+   relevant node group in the OpenTofu `compute` variable, set the
+   parameter `ignore_image_changes: true`. E.g.
+
+   ```terraform
+   # environments/$ENV/main.tf:
+   ...
+   compute = {
+     general = {
+       nodes = ["general-0", "general-1"]
+       ignore_image_changes: true
+     }
+   }
+   ...
+   ```
+
+3. Follow the [compute_init](../../ansible/roles/compute_init/README.md) README
+   to add OpenTofu and Ansible configuration for that role. The "rebootable"
+   nodes should all be in the `compute_init` group with the `compute_init_enable`
+   OpenTofu parameter set.
+
+4. If the [compute_init](../../ansible/roles/compute_init/README.md) README
+   showed that a custom image is required for any entry in the
+   `compute_init_enable` parameter, follow the usual process to build new
+   images as required.
+
+5. Update image references in the OpenTofu configuration. Normally these should
+   be in:
+   - `environments/site/tofu/variables.tf`: `cluster_image_id` for the default
+     cluster image.
+   - `environments/$ENV/tofu/main.tf`: parameter `image_id` in node groups
+     defined in the `compute` or `login` variables, to override the default
+     image for specific node groups.
+
+5. Modify `openhpc_slurm_partitions` to add a new partition covering rebuildable
+   nodes to use for rebuild jobs. If using the default OpenTofu
+   configurations, this variable is contained in an OpenTofu-templated file
+   `environments/$ENV/group_vars/all/partitions.yml` which must be overridden
+   by copying it to e.g. a `z_partitions.yml` file in the same directory.
+   However production sites will probably be overriding this file anyway to
+   customise it.
+
+   An example partition definition is:
+
+   ```yaml
+   openhpc_slurm_partitions:
+     ...
+     - name: rebuild
+       groups:
+         - name: general
+       default: NO
+       maxtime: 30
+       partition_params:
+         PriorityJobFactor: 65533
+         Hidden: YES
+         RootOnly: YES
+         DisableRootJobs: NO
+         PreemptMode: 'OFF'
+         OverSubscribe: EXCLUSIVE
+   ```
+
+   Which has parameters as follows:
+   TODO: update me!
+   - `name`: Partition name matching `rebuild` role variable `rebuild_job_partitions`,
+     default `rebuild`.
+   - `groups`: A list of node group names, matching keys in the OpenTofu `compute`
+     variable (see example configuration above). See discussion below.
+   - `default`: Must be set to `NO` so that it is not the default partition.
+   - `maxtime`: Maximum time to allow for rebuild jobs, in
+     [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
+     The example here is 30 minutes, but see discussion below
+   - `partition_params`: A mapping of additional parameters, which must be set
+     as follows:
+     - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs)
+       are always scheduled before jobs in "normal" partitions on the same
+       nodes. This value is the highest which can be set. See
+       [slurm.conf docs](https://slurm.schedmd.com/slurm.conf.html#OPT_PriorityJobFactor).
+       Note this is used instead of `PriorityTier` as the latter (with the
+       default appliance configuration) allows rebuild jobs to preempt and
+       suspend running user jobs, which is probably undesirable.
+     - `Hidden`: Don't show this partition in e.g. `sinfo` for unprivileged
+       users.
+     - `RootOnly`: Only allow the root user to submit jobs to this partition.
+     - `DisableRootJobs`: Don't disable the root user, in case this parameter
+       is set globally via `openhpc_config_extra`.
+     - `PreemptMode`: Don't allow reboot jobs to be preempted/suspended.
+     - `OverSubscribe`: Ensure that jobs run in this partition require the
+       entire node. This means they do not run on nodes at the same time as
+       user jobs running in partitions allowing non-exclusive use.
+
+   Note that this partition overlaps with "normal" partitions. If it is
+   desirable to roll out changes more gradually, it is possible to create
+   multiple "rebuild" partitions, but it is necessary that:
+   - The rebuild partitions should not themselves overlap, else nodes may be
+     rebuilt more than once.
+   - Each rebuild partition should entirely cover one or more "normal"
+     partitions, to avoid the possibility of user jobs being scheduled to a
+     mix of nodes using old and new images.
+
+6. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role:
+   - Add the `control` node into the `rebuild` group.
+   - Ensure an application credential to use for rebuilding nodes is available
+     on the deploy host (default location `~/.config/openstack/clouds.yaml`).
+     If not using that location override `rebuild_clouds`.
+   - **TODO:** CONFIGURE rebuild job defaults!
+
+7. Run `tofu apply` as usual to apply the new OpenTofu configuration.
+
+   > [!NOTE]
+   > If the cluster image references were updated at step 5, this will be
+   > a disruptive operation and should be planned as part of a normal upgrade
+   > cycle.
+
+   > [!CAUTION]
+   > Due to OpenTofu/Terraform state limitations, this will plan to delete and
+   > recreate all compute nodes in node groups where `ignore_image_changes: true`
+   > was not previously set. This is a one-time issue with adding this
+   > parameter, i.e. subsequent applies will not require this.
+
+TODO: clarify whether, if the image is bumped at this point, the compute nodes
+actually get recreated on the new or the old image??
+
+8. Run the `site.yml` playbook as normal to configure the cluster.
+
+The cluster is now ready to perform slurm-controlled upgrades as described in
+the next section.
+
+## Operations with Slurm-controlled Rebuilds
+
+This section describes how to trigger and control Slurm-controlled rebuilds.
+However, in general these are likely to be done as part of a cluster
+upgrade. As described in the introduction to this page, that will involve
+rebuilding the login and control nodes to the new image then re-running the
+`site.yml` playbook to reconfigure the cluster. That process is disruptive in
+that users have no access via SSH or Open OnDemand while it is occurring.
+However there is no need to drain compute nodes and create reservations etc.
+
+Triggering rebuild jobs is done using the following playbook:
+
+    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml
+
+This will create jobs to reimage every slurm-rebuildable node to the image
+currently defined in the OpenTofu configuration.
+
+Note that some of the [rebuild role variables](../../ansible/roles/rebuild/README.md)
+may also be useful as extravars, especially for testing or debugging.
+
+## Testing
+
+The below demonstrates testing this in the `.stackhpc` CI environment, using:
+- A 2-node default "standard" partition.
+- A 2-node "extra" partition (note this does not usually have any nodes by default).
+
+In one terminal launch a watch of job state:
+
+    [root@RL9-control rocky]# clear && ~/ewatch/ewatch.py -n 1 -i '\d+:\d+' 'squeue --all --Format=PARTITION,NAME:25,USERNAME:11,STATE:12,NUMNODES:8,NODELIST'
+
+This uses [ewatch](https://github.com/sjpb/ewatch) to summarise changes in
+output.
+ +In a second terminal, launch 2x normal jobs into the default ("standard") +partition: + + [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobA --wrap "sleep 20" && sbatch -N2 --job-name=JobB --wrap "sleep 10" + +In a third terminal, trigger rebuild jobs: + + .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_reboot=false rebuild_job_command="sleep 30"' - + +Back in the second terminal, submit more user jobs to either partition: + + [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobC --partition,standard,extra --wrap "sleep 10" + +The output from the first terminal should show: +- Job A runs on submission in the default "standard" partition. +- Job B pends for the default "standard" partition. +- Rebuild jobs runs on submission in the "extra" partition and pend for the "standard" partition +- Job C pends for both partitions +- Job A completes +- Rebuild jobs run on the "standard" partition, jumping ahead of JobB and JobC +- Rebuild jobs complete in the "extra" paritition +- JobC runs in the "extra" partition +- JobC completes +- Rebuild jobs complete in the "standard" partition +- Job B runs in the "standard" partition + +Example output: +``` +[2025-03-28T14:26:34.510466] +PARTITION NAME USER STATE NODES NODELIST +standard JobB demo_user PENDING 2 +standard JobA demo_user RUNNING 2 RL9-compute-[0-1] + +[2025-03-28T14:26:38.530213] +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-1 root PENDING 1 +rebuild rebuild-RL9-compute-0 root PENDING 1 +rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 +rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 +standard JobB demo_user PENDING 2 +standard JobA demo_user RUNNING 2 RL9-compute-[0-1] +standard,extra JobC demo_user PENDING 2 + +[2025-03-28T14:26:54.609651] +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 +rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 +standard JobB demo_user PENDING 2 +standard,extra JobC demo_user PENDING 2 + +[2025-03-28T14:28:39.091571] +PARTITION NAME USER STATE NODES NODELIST +extra JobC demo_user RUNNING 2 RL9-extra-[0-1] +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +standard JobB demo_user PENDING 2 + +[2025-03-28T14:28:49.139349] +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +standard JobB demo_user PENDING 2 + +[2025-03-28T14:28:55.168264] +PARTITION NAME USER STATE NODES NODELIST +standard JobB demo_user RUNNING 2 RL9-compute-[0-1] + +[2025-03-28T14:29:05.216346] +PARTITION NAME USER STATE NODES NODELIST +``` From b1639d0cd83912fe34b1ff60da32c9fb03101084 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 14:53:23 +0000 Subject: [PATCH 02/15] add (normally-empty) second partition for stackhpc --- environments/.stackhpc/tofu/main.tf | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index 7c6b774fb..8d78401bf 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -81,11 +81,12 @@ module "cluster" { compute_init_enable: ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", 
"cacerts"] ignore_image_changes: true } - # Example of how to add another partition: - # extra: { - # nodes: ["compute-2", "compute-3"] - # flavor: var.other_node_flavor - # } + # Normally-empty partition for testing: + extra: { + nodes: [] + #nodes: ["extra-0", "extra-1"] + flavor: var.other_node_flavor + } } volume_backed_instances = var.volume_backed_instances From 2d287cbea674a1faa074a6276935e39809e4a27d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 14:59:44 +0000 Subject: [PATCH 03/15] define rebuild partition for stackhpc --- .../inventory/group_vars/all/z_partitions.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 environments/.stackhpc/inventory/group_vars/all/z_partitions.yml diff --git a/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml b/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml new file mode 100755 index 000000000..ea489770e --- /dev/null +++ b/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml @@ -0,0 +1,18 @@ +# override tofu-generated file: +openhpc_slurm_partitions: + - name: extra + - name: standard + - name: rebuild + groups: + - name: extra + - name: standard + default: NO + maxtime: 30 + partition_params: + PriorityJobFactor: 65533 + Hidden: YES + RootOnly: YES + DisableRootJobs: NO + PreemptMode: 'OFF' + OverSubscribe: EXCLUSIVE + From 5c3c90eddeb9c860c847c38e334f23b8dfafaba0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:00:48 +0000 Subject: [PATCH 04/15] add leafcloud-dev tf vars file --- environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars diff --git a/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars b/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars new file mode 100644 index 000000000..82e336dd8 --- /dev/null +++ b/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars @@ -0,0 +1,10 @@ +cluster_networks = [ + { + network = "stackhpc-dev" + subnet = "stackhpc-dev" + } +] +control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment +other_node_flavor = "en1.xsmall" +state_volume_type = "unencrypted" +home_volume_type = "unencrypted" From 3b79b77a20ca6303f8d221b3ad672b70473b7201 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:07:03 +0000 Subject: [PATCH 05/15] rebuild docs tweak --- docs/experimental/slurm-controlled-rebuild.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index 4320071da..53de62bc6 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -221,7 +221,12 @@ This will create jobs to reimage every slurm-rebuildable node to the image currently defined in the OpenTofu configuration. Note that some of the [rebuild role variables](../../ansible/roles/rebuild/README.md) -may also be useful as extravars, especially for testing or debugging. +may also be useful as extravars, especially for testing or debugging. 
For +example the following comand will run in a non-default partition and does not +actually reboot/rebuild nodes, which may be useful for testing interactions with +other priority or QOS settings: + + ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_partitions=test rebuild_job_reboot=false' ## Testing From af9aa52a4eb9c103a601a3dea5c2ac460c8f451f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:16:04 +0000 Subject: [PATCH 06/15] ensure user limits via slurm are in place before starting slurmd --- .../roles/compute_init/files/compute-init.yml | 13 ++++++------ ansible/slurm.yml | 20 +++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 25af01154..06d62c798 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -324,12 +324,6 @@ enabled: true state: started - - name: Ensure slurmd service state - service: - name: slurmd - enabled: true - state: started - - name: Set locked memory limits on user-facing nodes lineinfile: path: /etc/security/limits.conf @@ -351,6 +345,13 @@ +:adm:ALL -:ALL:ALL + - name: Ensure slurmd service state + service: + name: slurmd + enabled: true + state: started + + - name: Ensure node is resumed # TODO: consider if this is always safe for all job states? command: scontrol update state=resume nodename={{ ansible_hostname }} diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 80812ae7d..d1bb93a9f 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -19,16 +19,6 @@ - import_role: name: rebuild -- name: Setup slurm - hosts: openhpc - become: yes - tags: - - openhpc - tasks: - - include_role: - name: stackhpc.openhpc - tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - - name: Set locked memory limits on user-facing nodes hosts: - compute @@ -63,3 +53,13 @@ +:adm:ALL -:ALL:ALL # vagrant uses (deprecated) ansible_ssh_user + +- name: Setup slurm + hosts: openhpc + become: yes + tags: + - openhpc + tasks: + - include_role: + name: stackhpc.openhpc + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" From 25fb17d9214494a4f9e4347e1ae353e35ad3bbf7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:42:38 +0000 Subject: [PATCH 07/15] add docs re. parellelism --- ansible/roles/rebuild/README.md | 6 +-- docs/experimental/slurm-controlled-rebuild.md | 37 ++++++++++++------- docs/production.md | 19 ++++++++++ 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md index ddce17933..58cb26502 100644 --- a/ansible/roles/rebuild/README.md +++ b/ansible/roles/rebuild/README.md @@ -34,9 +34,9 @@ running the `ansible/adhoc/rebuild-via-slurm.yml` playbook: send to `/dev/null` by default, as the root user running this has no shared directory for job output. -- `rebuild_job_reboot`: Bool, whether to add the `--reboot` flag to the job - to actually trigger a rebuild. Useful for e.g. testing priorities. Default - `true`. +- `rebuild_job_reboot`: Optional. A bool controlling whether to add the + `--reboot` flag to the job to actually trigger a rebuild. Useful for e.g. + testing partition configurations. Default `true`. - `rebuild_job_options`: Optional. A string giving any other options to pass to [sbatch](https://slurm.schedmd.com/sbatch.html). Default is empty string. 
diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md
index 53de62bc6..f7aae3ced 100644
--- a/docs/experimental/slurm-controlled-rebuild.md
+++ b/docs/experimental/slurm-controlled-rebuild.md
@@ -42,9 +42,6 @@ In summary, the way this functionality works is as follows:
    registers the node as having finished rebooting. It then launches the actual
    job, which does not do anything.
 
-   # TODO: check that this is the LAST thing we do?
-
-
 
 TODO: note terraform parallel limits
 
@@ -86,7 +83,13 @@ The configuration of this is complex and involves:
    ...
    compute = {
      general = {
        nodes = ["general-0", "general-1"]
-       ignore_image_changes: true
+       ignore_image_changes = true
+       ...
+     }
+     gpu = {
+       nodes = ["a100-0", "a100-1"]
+       ignore_image_changes = true
+       ...
+     }
    }
    ...
@@ -118,7 +121,8 @@ The configuration of this is complex and involves:
    However production sites will probably be overriding this file anyway to
    customise it.
 
-   An example partition definition is:
+   An example partition definition, given the two node groups "general" and
+   "gpu" shown in Step 2, is:
 
    ```yaml
    openhpc_slurm_partitions:
@@ -126,6 +130,7 @@ The configuration of this is complex and involves:
      - name: rebuild
        groups:
          - name: general
+         - name: gpu
        default: NO
        maxtime: 30
        partition_params:
@@ -138,15 +143,16 @@ The configuration of this is complex and involves:
    ```
 
    Which has parameters as follows:
-   TODO: update me!
    - `name`: Partition name matching `rebuild` role variable `rebuild_job_partitions`,
      default `rebuild`.
-   - `groups`: A list of node group names, matching keys in the OpenTofu `compute`
-     variable (see example configuration above). See discussion below.
+   - `groups`: A list of node group names, matching keys in the OpenTofu
+     `compute` variable (see example in step 2 above). Normally every compute
+     node group should be listed here, unless Slurm-controlled rebuild is not
+     required for certain node groups.
   - `default`: Must be set to `NO` so that it is not the default partition.
   - `maxtime`: Maximum time to allow for rebuild jobs, in
     [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
-     The example here is 30 minutes, but see discussion below
+     The example here is 30 minutes, but see discussion below.
   - `partition_params`: A mapping of additional parameters, which must be set
     as follows:
     - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs)
@@ -166,9 +172,12 @@ The configuration of this is complex and involves:
        entire node. This means they do not run on nodes at the same time as
        user jobs running in partitions allowing non-exclusive use.
 
-   Note that this partition overlaps with "normal" partitions. If it is
-   desirable to roll out changes more gradually, it is possible to create
-   multiple "rebuild" partitions, but it is necessary that:
+   The value for `maxtime` needs to be sufficient not just for a single node
+   to be rebuilt, but also to allow for any batching in either OpenTofu or
+   in Nova - see remarks in the [production docs](../production.md).
+
+   If it is desirable to roll out changes more gradually, it is possible to
+   create multiple "rebuild" partitions, but it is necessary that:
    - The rebuild partitions should not themselves overlap, else nodes may be
      rebuilt more than once.
    - Each rebuild partition should entirely cover one or more "normal"
     partitions, to avoid the possibility of user jobs being scheduled to a
     mix of nodes using old and new images.
@@ -179,8 +188,8 @@ The configuration of this is complex and involves:
 
 6. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role:
    - Add the `control` node into the `rebuild` group.
- Ensure an application credential to use for rebuilding nodes is available on the deploy host (default location `~/.config/openstack/clouds.yaml`). - If not using that location override `rebuild_clouds`. - - **TODO:** CONFIGURE rebuild job defaults! + - If required, override `rebuild_clouds_path` or other variables in the site + environment. 7. Run `tofu apply` as usual to apply the new OpenTofu configuration. diff --git a/docs/production.md b/docs/production.md index 7876af126..c15298887 100644 --- a/docs/production.md +++ b/docs/production.md @@ -130,3 +130,22 @@ and referenced from the `site` and `production` environments, e.g.: - See the [hpctests docs](../ansible/roles/hpctests/README.md) for advice on raising `hpctests_hpl_mem_frac` during tests. + +- By default, OpenTofu (and Terraform) [limits](https://opentofu.org/docs/cli/commands/apply/#apply-options) + the number of concurrent operations to 10. This means that for example only + 10 ports or 10 instances can be deployed at once. This should be raised by + modifying `environments/$ENV/activate` to add a line like: + + export TF_CLI_ARGS_apply="-parallelism=25" + + The value chosen should be the highest value demonstrated during testing. + Note that any time spent blocked due to this parallelism limit does not count + against the (un-overridable) internal OpenTofu timeout of 30 minutes + +- By default, OpenStack Nova also [limits](https://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.max_concurrent_builds) + the number of concurrent instance builds to 10. This is per Nova controller, + so 10x virtual machines per hypervisor. For baremetal nodes it is 10 per cloud + if the OpenStack version is earlier than Caracel, else this limit can be + raised using [shards](https://specs.openstack.org/openstack/nova-specs/specs/2024.1/implemented/ironic-shards.html). + In general it should be possible to raise this value to 50-100 if the cloud + is properly tuned, again, demonstrated through testing. From f4a5fb31197ddf463fd4d2a360f162513b04680a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:43:14 +0000 Subject: [PATCH 08/15] update expected stackhpc sinfo for empty 'extra' partition --- ansible/ci/check_slurm.yml | 1 + docs/experimental/slurm-controlled-rebuild.md | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index 4d77c6fb5..00e8d62cf 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -21,4 +21,5 @@ vars: expected_sinfo: + - " extra up 60-00:00:00 0 n/a" # empty partition - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle" diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index f7aae3ced..ca8081079 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -42,12 +42,6 @@ In summary, the way this functionality works is as follows: registers the node as having finished rebooting. It then launches the actual job, which does not do anything. - -TODO: note terraform parallel limits - -nova compute: 10 per nova controller, so either per HV or whole-cloud BM * (different in Caracel onwards). Could tune to 50-100 if properly set. 
- - ## Prerequsites To enable a compute node to rejoin the cluster after a vrebuild, functionality From 5d50649393c9eaebbe52f35e5e5f8dc99f18328a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:43:53 +0000 Subject: [PATCH 09/15] use new rebuild adhoc for CI --- .github/workflows/stackhpc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 4c7e28b2b..348edff35 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -173,11 +173,11 @@ jobs: ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Reimage compute nodes to image in current branch using slurm - tests compute-init + - name: Reimage compute nodes to image in current branch using slurm run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml + ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage to current branch From 916460bb378d9424fb005c7c3ed6cb845b52edb3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Apr 2025 09:00:29 +0000 Subject: [PATCH 10/15] fixup empty partition for CI check --- ansible/ci/check_slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index 00e8d62cf..feac502ed 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -21,5 +21,5 @@ vars: expected_sinfo: - - " extra up 60-00:00:00 0 n/a" # empty partition + - " extra up 60-00:00:00 0 n/a" # empty partition - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle" From d01dc4097b1d11e66496234cbbdb5bddd6d55db1 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 2 Apr 2025 10:26:56 +0100 Subject: [PATCH 11/15] Fix typo in docs/experimental/slurm-controlled-rebuild.md Co-authored-by: Will Szumski --- docs/experimental/slurm-controlled-rebuild.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index ca8081079..18ad42042 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -44,7 +44,7 @@ In summary, the way this functionality works is as follows: ## Prerequsites -To enable a compute node to rejoin the cluster after a vrebuild, functionality +To enable a compute node to rejoin the cluster after a rebuild, functionality must be built into the image. Before progressing you should check that all the functionality required for your cluster is currently supported by the `compute_init` role. 
Review that role's [README](../../ansible/roles/compute_init/README.md) From d42c6bac996b642bd88f059eec9c81104c375fa2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 4 Apr 2025 15:45:14 +0000 Subject: [PATCH 12/15] fix long nodenames getting truncated when listing partitions --- ansible/roles/rebuild/tasks/rebuild_partition.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/rebuild/tasks/rebuild_partition.yml b/ansible/roles/rebuild/tasks/rebuild_partition.yml index 0ce4ee88a..3b319e6e2 100644 --- a/ansible/roles/rebuild/tasks/rebuild_partition.yml +++ b/ansible/roles/rebuild/tasks/rebuild_partition.yml @@ -3,7 +3,7 @@ cmd: >- sinfo --Node - --Format=NodeList + --format=%N --noheader --partition={{ _rebuild_job_current_partition }} register: _sinfo_partition From 1343125d808d53d2cd294d3ee3d581f1889370dc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Apr 2025 09:10:44 +0000 Subject: [PATCH 13/15] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index ea8d46e4a..814769ed1 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250401-1100-9a3cffdb", - "RL9": "openhpc-RL9-250401-1100-9a3cffdb" + "RL8": "openhpc-RL8-250408-0812-c3b68b9c", + "RL9": "openhpc-RL9-250408-0813-c3b68b9c" } } From 1087f19275905207b922da62f54d8a17d41edc61 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Apr 2025 09:11:20 +0000 Subject: [PATCH 14/15] cope with nodes still running reboot job in check_slurm playbook --- ansible/ci/check_slurm.yml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index feac502ed..8d2b08350 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -6,19 +6,9 @@ shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout or "down" in sinfo.stdout) + until: sinfo.stdout_lines == expected_sinfo retries: 10 delay: 5 - - name: Check nodes have expected slurm state - assert: - that: sinfo.stdout_lines == expected_sinfo - fail_msg: | - sinfo output not as expected: - actual: - {{ sinfo.stdout_lines }} - expected: - {{ expected_sinfo }} - vars: expected_sinfo: - " extra up 60-00:00:00 0 n/a" # empty partition From 6c65f72bf33a52961be405346e17a9c247b0f780 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Apr 2025 11:02:03 +0000 Subject: [PATCH 15/15] increase retries when checking slurm state in CI --- ansible/ci/check_slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index 8d2b08350..ff527da06 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -7,7 +7,7 @@ register: sinfo changed_when: false until: sinfo.stdout_lines == expected_sinfo - retries: 10 + retries: 200 delay: 5 vars: expected_sinfo: