Commit b7d9c48

merge
2 parents: acf0c0d + 5b43d0e

File tree

8 files changed: +110 -68 lines


.github/workflows/nightlybuild.yml

Lines changed: 0 additions & 61 deletions
@@ -108,68 +108,7 @@ jobs:
           echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
           echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"

-      - name: Download image
-        run: |
-          . venv/bin/activate
-          sudo mkdir /mnt/images
-          sudo chmod 777 /mnt/images
-          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
-          openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }}
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-
-      - name: install libguestfs
-        run: |
-          sudo apt -y update
-          sudo apt -y install libguestfs-tools
-
-      - name: mkdir for mount
-        run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
-
-      - name: mount qcow2 file
-        run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
-
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@<version>
-        with:
-          scan-type: fs
-          scan-ref: "${{ steps.manifest.outputs.image-name }}"
-          scanners: "vuln"
-          format: sarif
-          output: "${{ steps.manifest.outputs.image-name }}.sarif"
-        # turn off secret scanning to speed things up
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Upload Trivy scan results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@v3
-        with:
-          sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif"
-          category: "${{ matrix.os_version }}-${{ matrix.build }}"
-
-      - name: Fail if scan has CRITICAL vulnerabilities
-        uses: aquasecurity/trivy-action@<version>
-        with:
-          scan-type: fs
-          scan-ref: "${{ steps.manifest.outputs.image-name }}"
-          scanners: "vuln"
-          format: table
-          exit-code: '1'
-          severity: 'CRITICAL'
-          ignore-unfixed: true
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Delete new image if Trivy scan fails
-        if: failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed
-        run: |
-          . venv/bin/activate
-          echo "Deleting new image due to critical vulnerabilities or scan failure ..."
-          openstack image delete "${{ steps.manifest.outputs.image-id }}"
-
       - name: Delete old latest image
-        if: success() # Runs only if Trivy scan passed
         run: |
           . venv/bin/activate
           IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l)

ansible/bootstrap.yml

Lines changed: 1 addition & 0 deletions
@@ -266,3 +266,4 @@
   tasks:
     - ansible.builtin.include_role:
         name: k3s
+        tasks_from: install.yml

ansible/roles/cluster_infra/templates/outputs.tf.j2

Lines changed: 2 additions & 2 deletions
@@ -24,8 +24,8 @@ output "cluster_nodes" {
       }
     },
     {
-      name = openstack_compute_instance_v2.control["control"].name
-      ip = openstack_compute_instance_v2.control["control"].network[0].fixed_ip_v4
+      name = openstack_compute_instance_v2.control.name
+      ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
       groups = ["control", "{{ cluster_name }}_control"],
       facts = {
         openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 2 additions & 3 deletions
@@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
-    k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
+    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
     k3s_token = "{{ k3s_token }}"
   }
 }
@@ -412,7 +412,6 @@ resource "openstack_compute_instance_v2" "control" {
   {% else %}
   flavor_id = "{{ control_flavor }}"
   {% endif %}
-  for_each = toset(["control"])

   network {
     port = openstack_networking_port_v2.control.id
@@ -566,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
-    k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
+    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
     k3s_token = "{{ k3s_token }}"
   }
 }
File renamed without changes.

ansible/roles/passwords/tasks/main.yml

Lines changed: 1 addition & 1 deletion
@@ -8,9 +8,9 @@
   run_once: true

 - name: Get templated passwords from target environment
+  # inventory group/host vars created in a play cannot be accessed in the same play, even after meta: refresh_inventory
   ansible.builtin.include_vars:
     file: "{{ openhpc_passwords_output_path }}"
-    name: templated_secrets

 - name: Template k3s token to terraform
   template:
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-    "k3s_token": "{{ templated_secrets.vault_k3s_token }}"
+    "k3s_token": "{{ vault_k3s_token }}"
 }

docs/upgrades.md

Lines changed: 103 additions & 0 deletions
# Upgrades

This document explains the generic steps required to upgrade a deployment of the Slurm Appliance with upstream changes from StackHPC.
Generally, upstream releases happen roughly monthly. Releases may contain new functionality and/or updated images.

Any site-specific instructions in [docs/site/README.md](site/README.md) should be reviewed in tandem with this document.

This document assumes the deployment repository has:
1. Remotes:
    - `origin`, referring to the site-specific remote repository.
    - `stackhpc`, referring to the StackHPC repository at https://github.com/stackhpc/ansible-slurm-appliance.git.
2. Branches:
    - `main` - tracking `origin/main`, the current site-specific code deployed to production.
    - `upstream` - tracking `stackhpc/main`, i.e. the upstream `main` branch from the `stackhpc` remote.
3. The following environments:
    - `$PRODUCTION`: a production environment, as defined by e.g. `environments/production/`.
    - `$STAGING`: a staging environment, as defined by e.g. `environments/staging/`.
    - `$SITE_ENV`: a base site-specific environment, as defined by e.g. `environments/mysite/`.
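
For example, assuming the repository was cloned from the site-specific `origin`, the remaining remote and branch layout could be created with the sketch below:

    # add the upstream StackHPC repository as a second remote
    git remote add stackhpc https://github.com/stackhpc/ansible-slurm-appliance.git
    # create a local branch tracking the upstream main branch
    git fetch stackhpc
    git checkout -b upstream stackhpc/main
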
**NB:** Commands which should be run on the Slurm login node are shown below prefixed `[LOGIN]$`.
All other commands should be run on the Ansible deploy host.

1. Update the `upstream` branch from the `stackhpc` remote, including tags:

        git fetch stackhpc main --tags

1. Identify the latest release from the [Slurm appliance release page](https://github.com/stackhpc/ansible-slurm-appliance/releases). Below, this release is shown as `vX.Y`.
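
   The tags fetched above can also be listed locally to check the most recent releases, e.g.:

        git tag --sort=-creatordate | head -n 5
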
1. Ensure your local site branch is up to date and create a new branch from it for the
   site-specific release code:

        git checkout main
        git pull --prune
        git checkout -b update/vX.Y

1. Merge the upstream code into your release branch:

        git merge vX.Y

   It is possible this will introduce merge conflicts; fix these following the usual git
   prompts. Generally, merge conflicts should only exist where functionality which was added
   for your site (not in a hook) has subsequently been merged upstream.
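
   A typical manual resolution is sketched below (file names are placeholders):

        # show which files are conflicted
        git status
        # edit the conflicted files to resolve the markers, then:
        git add <conflicted files>
        git commit
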
1. Push this branch and create a PR:

        git push -u origin update/vX.Y
        # then create a PR from this branch, e.g. by following the link git prints

1. Review the PR to see if any added/changed functionality requires alteration of
   site-specific configuration. In general, changes to existing functionality aim to be
   backward compatible. Alteration of site-specific configuration will usually only be
   necessary to use new functionality, or where functionality has been upstreamed as above.

   Make changes as necessary.
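
   One way to scope this review (a sketch only; `vA.B` is a placeholder for the release currently deployed at the site, and the paths shown are examples) is to diff the two upstream releases for the paths holding configuration defaults:

        git diff vA.B vX.Y -- ansible/ environments/common/
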
1. Identify the image(s) from the relevant [Slurm appliance release](https://github.com/stackhpc/ansible-slurm-appliance/releases), and download
   each using the link on the release plus the image name, e.g. for an image `openhpc-ofed-RL8-240906-1042-32568dbb`:

        wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/openhpc-ofed-RL8-240906-1042-32568dbb

   Note that some releases may not include new images. In this case, use the images from the latest previous release that included new images.
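
   The image also needs to exist in the target OpenStack project before it can be referenced. A minimal upload sketch (assuming a qcow2 image and OpenStack credentials already sourced; the name reuses the example above) is:

        openstack image create --disk-format qcow2 --container-format bare \
          --file openhpc-ofed-RL8-240906-1042-32568dbb openhpc-ofed-RL8-240906-1042-32568dbb
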
1. If required, build an "extra" image with local modifications; see [docs/image-build.md](./image-build.md).

1. Modify your site-specific environment to use this image, e.g. via `cluster_image_id` in `environments/$SITE_ENV/terraform/variables.tf`.

1. Test this in your staging cluster.

1. Commit changes and push to the PR created above.

1. Declare a future outage window to cluster users. A [Slurm reservation](https://slurm.schedmd.com/scontrol.html#lbAQ) can be
   used to prevent jobs running during that window, e.g.:

        [LOGIN]$ sudo scontrol create reservation Flags=MAINT ReservationName="upgrade-vX.Y" StartTime=2024-10-16T08:00:00 EndTime=2024-10-16T10:00:00 Nodes=ALL Users=root

   Note that a reservation cannot be created if it may overlap with currently running jobs (as bounded by job or partition time limits).
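
   Once created, the reservation can be checked with:

        [LOGIN]$ scontrol show reservation
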
1. At the start of the outage window, check there are no jobs running:

        [LOGIN]$ squeue

1. Deploy the branch created above to production: activate the production environment, run OpenTofu to reimage or
   delete/recreate instances with the new images (depending on how the root disk is defined), and run Ansible's `site.yml`
   playbook to reconfigure the cluster, e.g. as described in the main [README.md](../README.md).
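
   As a rough sketch only (the exact commands and paths depend on the site layout and are assumptions here; follow the main [README.md](../README.md) for the authoritative steps):

        . environments/$PRODUCTION/activate    # assumed per-environment activation script
        cd environments/$PRODUCTION/terraform  # assumed OpenTofu configuration location
        tofu apply                             # reimage or delete/recreate instances
        cd -
        ansible-playbook ansible/site.yml      # reconfigure the cluster
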
1. Check Slurm is up:

        [LOGIN]$ sinfo -R

   The `-R` flag shows the reason for any nodes being down.

1. If the above shows nodes down for having been "unexpectedly rebooted", set them up again:

        [LOGIN]$ sudo scontrol update state=RESUME nodename=$HOSTLIST_EXPR

   where the hostlist expression might look like e.g. `general-[0-1]` to reset the state of nodes 0 and 1 of the `general` partition.

1. Delete the reservation:

        [LOGIN]$ sudo scontrol delete ReservationName="upgrade-vX.Y"

1. Tell users the cluster is available again.
