
Commit 6dff03f

Merge pull request #26 from stackhpc/2025.5.0-sync
Upgrade to 2025.5.0
2 parents 33f6bdb + 8c91638

13 files changed: +235 additions, −17 deletions
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@

```yaml
---
name: Clean up stale CI resources
on:
  schedule:
    # Every 2 hours at 8 minutes past
    - cron: '8 0/2 * * *'
  workflow_dispatch:
    inputs:
      delete-resources:
        type: boolean
        description: "Delete resources older than 6h"
        required: true
      delete-all-keypairs:
        type: boolean
        description: "Delete all CI user keypairs"
        required: true
      target-cloud:
        description: >-
          The cloud to target for the run.
          Leave blank to use the default cloud.
        type: choice
        options:
          - ""
          - arcus
          - leafcloud

permissions: {}

jobs:
  ci-cleanup:
    name: Clean up stale CI resources
    if: github.repository == 'azimuth-cloud/azimuth-config'
    runs-on: ubuntu-latest
    permissions: {}
    steps:
      - name: Setup Python
        uses: actions/setup-python@v5

      - name: Generate clouds.yaml
        run: |
          cat << EOF > clouds.yaml
          ${{ secrets.OS_CLOUDS }}
          EOF

      - name: Install OpenStack client
        run: |
          pip install python-openstackclient

      - name: Clean up instances and attached volumes over 6 hours old
        if: ${{ github.event_name == 'schedule' || inputs.delete-resources }}
        run: |
          result=0
          changes_before=$(date -Imin -d -6hours)
          for status in ACTIVE BUILD ERROR SHUTOFF; do
            for instance in $(openstack server list --unlocked --format value --column ID --changes-before $changes_before --status $status); do
              echo "Cleaning up $status instance $instance"
              openstack server show $instance
              echo "Getting volumes for instance $instance"
              volumes=$(openstack server volume list -f value -c "Volume ID" $instance)
              keypair=$(openstack server show $instance -f value -c key_name)
              if ! openstack server delete $instance; then
                echo "Failed to delete $status instance $instance"
                result=1
              fi
              echo "Deleting keypair for instance $instance"
              # This shouldn't fail, but might if the keypair is in-use elsewhere
              openstack keypair delete $keypair || true
              for volume in $volumes; do
                echo "Cleaning up volume $volume from instance $instance"
                openstack volume show $volume
                if ! openstack volume delete $volume; then
                  echo "Failed to delete volume $volume"
                  result=1
                fi
              done
            done
          done
          exit $result
        env:
          OS_CLOUD: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}

      - name: Clean up all SSH keypairs
        if: ${{ inputs.delete-all-keypairs }}
        run: |
          for keypair in $(openstack keypair list --format value -c Name); do
            if [[ "$keypair" =~ ^azimuth- || "$keypair" =~ ^packer_ ]]; then
              openstack keypair delete $keypair
              echo "Deleted keypair $keypair"
            fi
          done
        env:
          OS_CLOUD: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}
```
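As a reference for operators, the 6-hour cutoff used by the cleanup step can be previewed by hand before dispatching the workflow. The sketch below only lists candidate servers and deletes nothing; it assumes `python-openstackclient` is installed locally and that `OS_CLOUD` names a cloud from your `clouds.yaml` (`arcus` is just one of the workflow's `target-cloud` options).

```bash
# Dry run of the stale-resource query used by the workflow: list unlocked
# servers older than 6 hours in each status, without deleting anything.
export OS_CLOUD=arcus
changes_before=$(date -Imin -d -6hours)
for status in ACTIVE BUILD ERROR SHUTOFF; do
  echo "== $status instances last changed before $changes_before =="
  openstack server list --unlocked --format value --column ID --column Name \
    --changes-before "$changes_before" --status "$status"
done
```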

Tiltfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -75,7 +75,7 @@ settings = deep_merge(
         "release_namespace": "azimuth",
     },
     "coral-credits": {
-        "release_namespace": "coral-credits",
+        "release_namespace": "azimuth",
     },
     "cluster-api-addon-provider": {
         "release_namespace": "capi-addon-system",
```

bin/ci-setup

Lines changed: 1 addition & 1 deletion
```diff
@@ -49,7 +49,7 @@ else
 fi
 
 run_apt update
-run_apt install -y -t 'o=LP-PPA-mozillateam' firefox
+run_apt install -y -t 'o=LP-PPA-mozillateam' firefox-esr
 
 pip install -U pip
 pip install -r requirements.txt
```
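If the CI browser install misbehaves, one way to confirm that the Mozilla Team PPA is the origin actually supplying `firefox-esr` (matching the `-t 'o=LP-PPA-mozillateam'` pin above) is a quick policy check; this assumes the PPA has already been added earlier in `bin/ci-setup`.

```bash
# Show where apt would install firefox-esr from; the candidate version should
# come from the LP-PPA-mozillateam origin rather than the snap-transition stub.
apt-cache policy firefox-esr
```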

docs/debugging/kubernetes.md

Lines changed: 48 additions & 2 deletions
````diff
@@ -121,7 +121,7 @@ NAME CLUSTER REPLICAS READY UPDATE
 machinedeployment.cluster.x-k8s.io/demo-sm0 demo 1 1 1 0 Running 11d v1.24.2
 
 NAME PHASE AGE VERSION
-cluster.cluster.x-k8s.io/demo Provisioned 11d
+cluster.cluster.x-k8s.io/demo Provisioned 11d
 
 NAME CLUSTER NODENAME PROVIDERID PHASE AGE VERSION
 machine.cluster.x-k8s.io/demo-control-plane-7p8zv demo demo-control-plane-7d76d0be-z6dm8 openstack:///f687f926-3cee-4550-91e5-32c2885708b0 Running 11d v1.24.2
@@ -133,7 +133,7 @@ NAME CLUSTER
 kubeadmcontrolplane.controlplane.cluster.x-k8s.io/demo-control-plane demo true true 3 3 3 0 11d v1.24.2
 
 NAME CLUSTER READY NETWORK SUBNET BASTION IP
-openstackcluster.infrastructure.cluster.x-k8s.io/demo demo true 4b6b2722-ee5b-40ec-8e52-a6610e14cc51 73e22c49-10b8-4763-af2f-4c0cce007c82
+openstackcluster.infrastructure.cluster.x-k8s.io/demo demo true 4b6b2722-ee5b-40ec-8e52-a6610e14cc51 73e22c49-10b8-4763-af2f-4c0cce007c82
 
 NAME CLUSTER INSTANCESTATE READY PROVIDERID MACHINE
 openstackmachine.infrastructure.cluster.x-k8s.io/demo-control-plane-7d76d0be-d2mcr demo ACTIVE true openstack:///ea91f79a-8abb-4cb9-a2ea-8f772568e93c demo-control-plane-9skvh
@@ -167,6 +167,52 @@ kubectl -n capo-system logs deploy/capo-controller-manager
 kubectl -n capi-addon-system logs deploy/cluster-api-addon-provider
 ```
 
+### Recovering clusters stuck in a failed state after network disruption
+
+If the underlying cloud infrastructure has undergone maintenance or suffered
+from temporary networking problems, clusters can get stuck in a 'Failed' state
+even after the network has recovered and the cluster is otherwise fully
+functional.
+This can happen when `failureMessage` and `failureReason` are set, which
+Cluster API mistakenly interprets as an unrecoverable error and therefore
+changes the cluster's status to `Failed`. There are ongoing discussions in the
+Kubernetes community about resolving this mistaken interpretation of transient
+networking errors, but for now the failed status must be cleared manually.
+
+If you think this is the case, you can check for affected clusters with the following command:
+
+```command title="On the K3s node, targeting the HA cluster if deployed"
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json | jq -r '.items[] | "\(.metadata.name): \(.status.failureMessage) \(.status.failureReason)"'
+```
+
+Clusters where one or both of the `failure{Message,Reason}` fields is not
+`null` are affected.
+You can reset the status for an individual cluster by removing the
+failure message and reason fields using
+`kubectl edit --subresource=status clusters.cluster.x-k8s.io/<cluster-name>`.
+Alternatively, you can apply a patch to all workload clusters at once using the
+following commands:
+
+```command title="On the K3s node, targeting the HA cluster if deployed"
+# Extract the list of failed clusters and generate the required `kubectl patch` command for each one
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
+  | jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "kubectl patch cluster.cluster.x-k8s.io \(.metadata.name) -n \(.metadata.namespace) --type=merge --subresource=status -p '\''{\"status\": {\"failureMessage\": null, \"failureReason\": null}}'\''"'
+kubectl patch cluster.cluster.x-k8s.io demo1 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo2 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo3 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo4 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+
+# The same pipeline with the output piped into `sh` so that the generated `kubectl patch` commands are executed to fix the failed clusters
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
+  | jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "kubectl patch cluster.cluster.x-k8s.io \(.metadata.name) -n \(.metadata.namespace) --type=merge --subresource=status -p '\''{\"status\": {\"failureMessage\": null, \"failureReason\": null}}'\''"' \
+  | sh
+cluster.cluster.x-k8s.io/demo1 patched
+cluster.cluster.x-k8s.io/demo2 patched
+cluster.cluster.x-k8s.io/demo3 patched
+cluster.cluster.x-k8s.io/demo4 patched
+```
+
 ## Accessing tenant clusters
 
 The kubeconfigs for all tenant clusters are stored as secrets. First, you need
````
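After patching, the same `jq` filter can be reused as a quick re-check; an empty result means no cluster still has failure fields set. This is a sketch using only the commands shown in the section above.

```bash
# Re-run the failure-field check; any clusters listed here still need attention.
kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
  | jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "\(.metadata.namespace)/\(.metadata.name) still has failureMessage/failureReason set"'
```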

docs/operations/maintenance.md

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@

````markdown
# Maintenance

## Pausing reconciliation of tenant and management clusters

Kubernetes clusters will automatically reconcile when resources are detected as
unavailable. Usually this is good, intended behaviour. However, if we have a
known period of time where statuses are expected to be incorrect or
unavailable, such as an outage window for OpenStack APIs, it is sensible to
pause reconciliation.

Reconciliation should be paused for all tenant clusters and for the CAPI
management cluster.

### Tenant clusters

Follow these steps to access the Seed VM and target the management cluster.

Apply the annotation ``cluster.x-k8s.io/paused=true`` to all clusters.

```bash
kubectl annotate --all --all-namespaces clusters.cluster.x-k8s.io cluster.x-k8s.io/paused=true
cluster.cluster.x-k8s.io/test-1 annotated
cluster.cluster.x-k8s.io/test-2 annotated
```

After the system is back in a stable state, remove the
``cluster.x-k8s.io/paused`` annotation.

```bash
kubectl annotate --all --all-namespaces clusters.cluster.x-k8s.io cluster.x-k8s.io/paused-
cluster.cluster.x-k8s.io/test-1 annotated
cluster.cluster.x-k8s.io/test-2 annotated
```

### Management cluster

Follow these steps to access the Seed VM and target the K3s cluster.

Get the name of the cluster.

```bash
kubectl get clusters.cluster.x-k8s.io
NAME           CLUSTERCLASS   PHASE         AGE    VERSION
cluster-name                  Provisioned   365d
```

Apply the annotation ``cluster.x-k8s.io/paused=true`` to the cluster.

```bash
kubectl annotate clusters.cluster.x-k8s.io/cluster-name cluster.x-k8s.io/paused=true
cluster.cluster.x-k8s.io/cluster-name annotated
```

After the system is back in a stable state, remove the
``cluster.x-k8s.io/paused`` annotation.

```bash
kubectl annotate clusters.cluster.x-k8s.io/cluster-name cluster.x-k8s.io/paused-
cluster.cluster.x-k8s.io/cluster-name annotated
```
````
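To confirm which clusters currently carry the pause annotation (for example, after running the annotate commands above), a small check along these lines can help; it assumes `jq` is available, as in the debugging docs.

```bash
# List every Cluster API cluster with the value of its paused annotation;
# "null" means the cluster is not currently paused.
kubectl get clusters.cluster.x-k8s.io --all-namespaces -o json \
  | jq -r '.items[] | "\(.metadata.namespace)/\(.metadata.name): paused=\(.metadata.annotations["cluster.x-k8s.io/paused"] // "null")"'
```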

docs/repository/index.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -153,3 +153,8 @@ git push --set-upstream origin upgrade/$RELEASE_TAG
 
 You can now open a merge (or pull) request proposing the upgrade to your `main` branch
 that can be reviewed like any other.
+
+Once the upgrade branch has been merged into your `main` branch, you can follow the
+steps for [Activating an environment](../deployment/index.md#activating-an-environment),
+and [Deploying Azimuth](../deployment/index.md#deploying-an-environment) to deploy the
+upgrade.
```

environments/base/inventory/group_vars/all.yml

Lines changed: 5 additions & 1 deletion
```diff
@@ -67,6 +67,10 @@ azimuth_apps_enabled: yes
 azimuth_kubernetes_enabled: yes
 azimuth_clusters_enabled: yes
 
+# Indicates whether to install FluxCD on management cluster
+# (required to install Flux-based addons)
+flux_enabled: false
+
 # The base domain for Azimuth ingress resources
 # This should be set by the concrete environment
 ingress_base_domain: "{{ undef(hint = 'ingress_base_domain is required') }}"
@@ -152,7 +156,7 @@ __os_auth_url: >-
   {{-
     lookup('file', __os_clouds_file) |
     from_yaml |
-    json_query('clouds.' + __os_cloud + '.auth.auth_url') |
+    json_query('clouds.' + '"%s"' % __os_cloud + '.auth.auth_url') |
     trim('/')
   }}
 azimuth_openstack_auth_url: "{{ __os_auth_url.removesuffix('/v3') }}/v3"
```
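The `json_query` change above wraps the cloud name in double quotes inside the JMESPath expression, so that cloud names which are not plain identifiers (for example ones containing a dash) still resolve. A minimal illustration, assuming Python 3 with the `jmespath` package (the library behind Ansible's `json_query` filter) and a hypothetical cloud named `my-cloud`:

```bash
# Demonstrate why the cloud name must be quoted in the JMESPath query.
python3 - <<'EOF'
import jmespath

clouds = {"clouds": {"my-cloud": {"auth": {"auth_url": "https://keystone.example.org:5000/"}}}}

# Unquoted, as in the old expression: 'my-cloud' is not a valid identifier
try:
    print(jmespath.search("clouds.my-cloud.auth.auth_url", clouds))
except jmespath.exceptions.JMESPathError as exc:
    print(f"unquoted query failed: {exc}")

# Quoted, as in the new expression: works for any cloud name
print(jmespath.search('clouds."my-cloud".auth.auth_url', clouds))
EOF
```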

environments/capi-mgmt-example/inventory/group_vars/all/variables.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,7 +33,7 @@
 
 # The Kubernetes version that will be used for the HA cluster
 # This should match the specified image
-# capi_cluster_kubernetes_version: 1.29.11
+# capi_cluster_kubernetes_version: 1.30.12
 
 # The name of the flavor to use for control plane nodes
 # At least 2 CPUs and 8GB RAM is required
```

environments/capi-mgmt/inventory/group_vars/all.yml

Lines changed: 9 additions & 9 deletions
```diff
@@ -24,28 +24,28 @@ infra_flavor_id: >-
 # Upload the Kubernetes image we need for the HA cluster as a private image
 # By default, we get the image from the azimuth-images version
 community_images_default:
-  kube_1_29:
-    name: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].name }}"
-    source_url: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].url }}"
-    checksum: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].checksum }}"
+  kube_1_30:
+    name: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].name }}"
+    source_url: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].url }}"
+    checksum: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].checksum }}"
     source_disk_format: "qcow2"
     container_format: "bare"
-    kubernetes_version: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].kubernetes_version }}"
+    kubernetes_version: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].kubernetes_version }}"
 community_images_default_visibility: private
 community_images_update_existing_visibility: false
 
 capi_cluster_kubernetes_version: >-
   {{-
-    community_images.kube_1_29.kubernetes_version
-    if community_images is defined and 'kube_1_29' in community_images
+    community_images.kube_1_30.kubernetes_version
+    if community_images is defined and 'kube_1_30' in community_images
     else undef(hint = 'capi_cluster_kubernetes_version is required')
   }}
 capi_cluster_machine_image_id: >-
   {{-
-    community_images_image_ids.kube_1_29
+    community_images_image_ids.kube_1_30
     if (
       community_images_image_ids is defined and
-      'kube_1_29' in community_images_image_ids
+      'kube_1_30' in community_images_image_ids
     )
     else undef(hint = 'capi_cluster_machine_image_id is required')
   }}
```
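A possible sanity check after this 1.29 to 1.30 bump, sketched under assumptions (the image name filter and the node you run `kubectl` from are not prescribed by this change): confirm the new private image exists and that the HA cluster's control plane eventually reports a matching version.

```bash
# Look for the uploaded Kubernetes 1.30 image; the name filter is an assumption,
# adjust it to match the azimuth-images manifest entry for your release.
openstack image list --private --format value --column Name | grep -i '1\.30' \
  || echo "no 1.30 image found yet"

# From the seed/K3s node: the control plane version of the HA cluster should
# converge on the 1.30 release referenced by the manifest.
kubectl get kubeadmcontrolplane.controlplane.cluster.x-k8s.io --all-namespaces
```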

environments/ecmwf-base/inventory/group_vars/all/variables.yml

Lines changed: 4 additions & 1 deletion
```diff
@@ -4,7 +4,7 @@
 
 # The size in GB for the data volume
 # This will hold all cluster data, including Kubernetes resources, and also PVC data
-infra_data_volume_size: 50
+infra_data_volume_size: 100
 
 #####
 # Configuration for the HA cluster
@@ -180,3 +180,6 @@ velero_backup_schedule_timings: "0 0 * * *"
 # Time-to-live for existing backups (defaults to 1 week)
 # See https://pkg.go.dev/time#ParseDuration for duration format options
 velero_backup_schedule_ttl: "168h"
+
+# Double size of Prometheus volume
+capi_cluster_addons_monitoring_prometheus_volume_size: 20Gi
```
