Commit b7d9c48

merge
2 parents: acf0c0d + 5b43d0e

File tree

8 files changed: +110 -68 lines


.github/workflows/nightlybuild.yml

Lines changed: 0 additions & 61 deletions
@@ -108,68 +108,7 @@ jobs:
           echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
           echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"

-      - name: Download image
-        run: |
-          . venv/bin/activate
-          sudo mkdir /mnt/images
-          sudo chmod 777 /mnt/images
-          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
-          openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }}
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-
-      - name: install libguestfs
-        run: |
-          sudo apt -y update
-          sudo apt -y install libguestfs-tools
-
-      - name: mkdir for mount
-        run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
-
-      - name: mount qcow2 file
-        run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
-
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@<version>
-        with:
-          scan-type: fs
-          scan-ref: "${{ steps.manifest.outputs.image-name }}"
-          scanners: "vuln"
-          format: sarif
-          output: "${{ steps.manifest.outputs.image-name }}.sarif"
-        # turn off secret scanning to speed things up
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Upload Trivy scan results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@v3
-        with:
-          sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif"
-          category: "${{ matrix.os_version }}-${{ matrix.build }}"
-
-      - name: Fail if scan has CRITICAL vulnerabilities
-        uses: aquasecurity/trivy-action@<version>
-        with:
-          scan-type: fs
-          scan-ref: "${{ steps.manifest.outputs.image-name }}"
-          scanners: "vuln"
-          format: table
-          exit-code: '1'
-          severity: 'CRITICAL'
-          ignore-unfixed: true
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Delete new image if Trivy scan fails
-        if: failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed
-        run: |
-          . venv/bin/activate
-          echo "Deleting new image due to critical vulnerabilities or scan failure ..."
-          openstack image delete "${{ steps.manifest.outputs.image-id }}"
-
       - name: Delete old latest image
-        if: success() # Runs only if Trivy scan passed
         run: |
           . venv/bin/activate
           IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l)

ansible/bootstrap.yml

Lines changed: 1 addition & 0 deletions
@@ -266,3 +266,4 @@
   tasks:
     - ansible.builtin.include_role:
         name: k3s
+        tasks_from: install.yml

ansible/roles/cluster_infra/templates/outputs.tf.j2

Lines changed: 2 additions & 2 deletions
@@ -24,8 +24,8 @@ output "cluster_nodes" {
       }
     },
     {
-      name = openstack_compute_instance_v2.control["control"].name
-      ip = openstack_compute_instance_v2.control["control"].network[0].fixed_ip_v4
+      name = openstack_compute_instance_v2.control.name
+      ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
       groups = ["control", "{{ cluster_name }}_control"],
       facts = {
         openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 2 additions & 3 deletions
@@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
-    k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
+    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
     k3s_token = "{{ k3s_token }}"
   }
 }
@@ -412,7 +412,6 @@ resource "openstack_compute_instance_v2" "control" {
   {% else %}
   flavor_id = "{{ control_flavor }}"
   {% endif %}
-  for_each = toset(["control"])

   network {
     port = openstack_networking_port_v2.control.id
@@ -566,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
-    k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
+    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
     k3s_token = "{{ k3s_token }}"
   }
 }
File renamed without changes.

ansible/roles/passwords/tasks/main.yml

Lines changed: 1 addition & 1 deletion
@@ -8,9 +8,9 @@
   run_once: true

 - name: Get templated passwords from target environment
+  # inventory group/host vars created in a play cannot be accessed in the same play, even after meta: refresh_inventory
   ansible.builtin.include_vars:
     file: "{{ openhpc_passwords_output_path }}"
-    name: templated_secrets

 - name: Template k3s token to terraform
   template:
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-    "k3s_token": "{{ templated_secrets.vault_k3s_token }}"
+    "k3s_token": "{{ vault_k3s_token }}"
 }

docs/upgrades.md

Lines changed: 103 additions & 0 deletions
# Upgrades

This document explains the generic steps required to upgrade a deployment of the Slurm Appliance with upstream changes from StackHPC.
Generally, upstream releases happen roughly monthly. Releases may contain new functionality and/or updated images.

Any site-specific instructions in [docs/site/README.md](site/README.md) should be reviewed in tandem with this document.

This document assumes the deployment repository has:
1. Remotes:
    - `origin`, referring to the site-specific remote repository.
    - `stackhpc`, referring to the StackHPC repository at https://github.com/stackhpc/ansible-slurm-appliance.git.
2. Branches:
    - `main` - tracking `origin/main`, the current site-specific code deployed to production.
    - `upstream` - tracking `stackhpc/main`, i.e. the upstream `main` branch from the `stackhpc` remote.
3. The following environments:
    - `$PRODUCTION`: a production environment, as defined by e.g. `environments/production/`.
    - `$STAGING`: a staging environment, as defined by e.g. `environments/staging/`.
    - `$SITE_ENV`: a base site-specific environment, as defined by e.g. `environments/mysite/`.
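
For example, assuming the repository was cloned from the site-specific `origin`, the remaining remote and branch layout could be created with the sketch below:

    # add the upstream StackHPC repository as a second remote
    git remote add stackhpc https://github.com/stackhpc/ansible-slurm-appliance.git
    # create a local branch tracking the upstream main branch
    git fetch stackhpc
    git checkout -b upstream stackhpc/main
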
**NB:** Commands which should be run on the Slurm login node are shown below prefixed `[LOGIN]$`.
All other commands should be run on the Ansible deploy host.

1. Update the `upstream` branch from the `stackhpc` remote, including tags:

        git fetch stackhpc main --tags

1. Identify the latest release from the [Slurm appliance release page](https://github.com/stackhpc/ansible-slurm-appliance/releases). Below, this release is shown as `vX.Y`.
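
   The tags fetched above can also be listed locally to check the most recent releases, e.g.:

        git tag --sort=-creatordate | head -n 5
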
1. Ensure your local site branch is up to date and create a new branch from it for the
   site-specific release code:

        git checkout main
        git pull --prune
        git checkout -b update/vX.Y

1. Merge the upstream code into your release branch:

        git merge vX.Y

   It is possible this will introduce merge conflicts; fix these following the usual git
   prompts. Generally, merge conflicts should only exist where functionality which was added
   for your site (not in a hook) has subsequently been merged upstream.
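
   A typical manual resolution is sketched below (file names are placeholders):

        # show which files are conflicted
        git status
        # edit the conflicted files to resolve the markers, then:
        git add <conflicted files>
        git commit
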
1. Push this branch and create a PR:

        git push -u origin update/vX.Y
        # then create a PR from this branch, e.g. by following the link git prints

1. Review the PR to see if any added/changed functionality requires alteration of
   site-specific configuration. In general, changes to existing functionality aim to be
   backward compatible. Alteration of site-specific configuration will usually only be
   necessary to use new functionality, or where functionality has been upstreamed as above.

   Make changes as necessary.
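
   One way to scope this review (a sketch only; `vA.B` is a placeholder for the release currently deployed at the site, and the paths shown are examples) is to diff the two upstream releases for the paths holding configuration defaults:

        git diff vA.B vX.Y -- ansible/ environments/common/
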
1. Identify the image(s) from the relevant [Slurm appliance release](https://github.com/stackhpc/ansible-slurm-appliance/releases), and download
   each using the link on the release plus the image name, e.g. for an image `openhpc-ofed-RL8-240906-1042-32568dbb`:

        wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/openhpc-ofed-RL8-240906-1042-32568dbb

   Note that some releases may not include new images. In this case, use the images from the latest previous release that included new images.
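
   The image also needs to exist in the target OpenStack project before it can be referenced. A minimal upload sketch (assuming a qcow2 image and OpenStack credentials already sourced; the name reuses the example above) is:

        openstack image create --disk-format qcow2 --container-format bare \
          --file openhpc-ofed-RL8-240906-1042-32568dbb openhpc-ofed-RL8-240906-1042-32568dbb
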
1. If required, build an "extra" image with local modifications; see [docs/image-build.md](./image-build.md).

1. Modify your site-specific environment to use this image, e.g. via `cluster_image_id` in `environments/$SITE_ENV/terraform/variables.tf`.

1. Test this in your staging cluster.

1. Commit changes and push to the PR created above.

1. Declare a future outage window to cluster users. A [Slurm reservation](https://slurm.schedmd.com/scontrol.html#lbAQ) can be
   used to prevent jobs running during that window, e.g.:

        [LOGIN]$ sudo scontrol create reservation Flags=MAINT ReservationName="upgrade-vX.Y" StartTime=2024-10-16T08:00:00 EndTime=2024-10-16T10:00:00 Nodes=ALL Users=root

   Note that a reservation cannot be created if it may overlap with currently running jobs (as bounded by job or partition time limits).
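
   Once created, the reservation can be checked with:

        [LOGIN]$ scontrol show reservation
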
1. At the start of the outage window, check there are no jobs running:

        [LOGIN]$ squeue

1. Deploy the branch created above to production: activate the production environment, run OpenTofu to reimage or
   delete/recreate instances with the new images (depending on how the root disk is defined), and run Ansible's `site.yml`
   playbook to reconfigure the cluster, e.g. as described in the main [README.md](../README.md).
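
   As a rough sketch only (the exact commands and paths depend on the site layout and are assumptions here; follow the main [README.md](../README.md) for the authoritative steps):

        . environments/$PRODUCTION/activate    # assumed per-environment activation script
        cd environments/$PRODUCTION/terraform  # assumed OpenTofu configuration location
        tofu apply                             # reimage or delete/recreate instances
        cd -
        ansible-playbook ansible/site.yml      # reconfigure the cluster
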
1. Check Slurm is up:

        [LOGIN]$ sinfo -R

   The `-R` flag shows the reason for any nodes being down.

1. If the above shows nodes down for having been "unexpectedly rebooted", set them up again:

        [LOGIN]$ sudo scontrol update state=RESUME nodename=$HOSTLIST_EXPR

   where the hostlist expression might look like e.g. `general-[0-1]` to reset the state of nodes 0 and 1 of the `general` partition.

1. Delete the reservation:

        [LOGIN]$ sudo scontrol delete ReservationName="upgrade-vX.Y"

1. Tell users the cluster is available again.
