
Commit 6dff03f

Merge pull request #26 from stackhpc/2025.5.0-sync
Upgrade to 2025.5.0
2 parents 33f6bdb + 8c91638

13 files changed: +235 additions, −17 deletions
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@

```yaml
---
name: Clean up stale CI resources
on:
  schedule:
    # Every 2 hours at 8 minutes past
    - cron: '8 0/2 * * *'
  workflow_dispatch:
    inputs:
      delete-resources:
        type: boolean
        description: "Delete resources older than 6h"
        required: true
      delete-all-keypairs:
        type: boolean
        description: "Delete all CI user keypairs"
        required: true
      target-cloud:
        description: >-
          The cloud to target for the run.
          Leave blank to use the default cloud.
        type: choice
        options:
          - ""
          - arcus
          - leafcloud

permissions: {}

jobs:
  ci-cleanup:
    name: Clean up stale CI resources
    if: github.repository == 'azimuth-cloud/azimuth-config'
    runs-on: ubuntu-latest
    permissions: {}
    steps:
      - name: Setup Python
        uses: actions/setup-python@v5

      - name: Generate clouds.yaml
        run: |
          cat << EOF > clouds.yaml
          ${{ secrets.OS_CLOUDS }}
          EOF

      - name: Install OpenStack client
        run: |
          pip install python-openstackclient

      - name: Clean up instances and attached volumes over 6 hours old
        if: ${{ github.event_name == 'schedule' || inputs.delete-resources }}
        run: |
          result=0
          changes_before=$(date -Imin -d -6hours)
          for status in ACTIVE BUILD ERROR SHUTOFF; do
            for instance in $(openstack server list --unlocked --format value --column ID --changes-before $changes_before --status $status); do
              echo "Cleaning up $status instance $instance"
              openstack server show $instance
              echo "Getting volumes for instance $instance"
              volumes=$(openstack server volume list -f value -c "Volume ID" $instance)
              keypair=$(openstack server show $instance -f value -c key_name)
              if ! openstack server delete $instance; then
                echo "Failed to delete $status instance $instance"
                result=1
              fi
              echo "Deleting keypair for instance $instance"
              # This shouldn't fail, but might if the keypair is in-use elsewhere
              openstack keypair delete $keypair || true
              for volume in $volumes; do
                echo "Cleaning up volume $volume from instance $instance"
                openstack volume show $volume
                if ! openstack volume delete $volume; then
                  echo "Failed to delete volume $volume"
                  result=1
                fi
              done
            done
          done
          exit $result
        env:
          OS_CLOUD: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}

      - name: Clean up all SSH keypairs
        if: ${{ inputs.delete-all-keypairs }}
        run: |
          for keypair in $(openstack keypair list --format value -c Name); do
            if [[ "$keypair" =~ ^azimuth- || "$keypair" =~ ^packer_ ]]; then
              openstack keypair delete $keypair
              echo "Deleted keypair $keypair"
            fi
          done
        env:
          OS_CLOUD: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}
```
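As a reference for operators, the 6-hour cutoff used by the cleanup step can be previewed by hand before dispatching the workflow. The sketch below only lists candidate servers and deletes nothing; it assumes `python-openstackclient` is installed locally and that `OS_CLOUD` names a cloud from your `clouds.yaml` (`arcus` is just one of the workflow's `target-cloud` options).

```bash
# Dry run of the stale-resource query used by the workflow: list unlocked
# servers older than 6 hours in each status, without deleting anything.
export OS_CLOUD=arcus
changes_before=$(date -Imin -d -6hours)
for status in ACTIVE BUILD ERROR SHUTOFF; do
  echo "== $status instances last changed before $changes_before =="
  openstack server list --unlocked --format value --column ID --column Name \
    --changes-before "$changes_before" --status "$status"
done
```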

Tiltfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -75,7 +75,7 @@ settings = deep_merge(
         "release_namespace": "azimuth",
     },
     "coral-credits": {
-        "release_namespace": "coral-credits",
+        "release_namespace": "azimuth",
     },
     "cluster-api-addon-provider": {
         "release_namespace": "capi-addon-system",
```

bin/ci-setup

Lines changed: 1 addition & 1 deletion
```diff
@@ -49,7 +49,7 @@ else
 fi
 
 run_apt update
-run_apt install -y -t 'o=LP-PPA-mozillateam' firefox
+run_apt install -y -t 'o=LP-PPA-mozillateam' firefox-esr
 
 pip install -U pip
 pip install -r requirements.txt
```
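If the CI browser install misbehaves, one way to confirm that the Mozilla Team PPA is the origin actually supplying `firefox-esr` (matching the `-t 'o=LP-PPA-mozillateam'` pin above) is a quick policy check; this assumes the PPA has already been added earlier in `bin/ci-setup`.

```bash
# Show where apt would install firefox-esr from; the candidate version should
# come from the LP-PPA-mozillateam origin rather than the snap-transition stub.
apt-cache policy firefox-esr
```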

docs/debugging/kubernetes.md

Lines changed: 48 additions & 2 deletions
````diff
@@ -121,7 +121,7 @@ NAME CLUSTER REPLICAS READY UPDATE
 machinedeployment.cluster.x-k8s.io/demo-sm0 demo 1 1 1 0 Running 11d v1.24.2
 
 NAME PHASE AGE VERSION
-cluster.cluster.x-k8s.io/demo Provisioned 11d
+cluster.cluster.x-k8s.io/demo Provisioned 11d
 
 NAME CLUSTER NODENAME PROVIDERID PHASE AGE VERSION
 machine.cluster.x-k8s.io/demo-control-plane-7p8zv demo demo-control-plane-7d76d0be-z6dm8 openstack:///f687f926-3cee-4550-91e5-32c2885708b0 Running 11d v1.24.2
@@ -133,7 +133,7 @@ NAME CLUSTER
 kubeadmcontrolplane.controlplane.cluster.x-k8s.io/demo-control-plane demo true true 3 3 3 0 11d v1.24.2
 
 NAME CLUSTER READY NETWORK SUBNET BASTION IP
-openstackcluster.infrastructure.cluster.x-k8s.io/demo demo true 4b6b2722-ee5b-40ec-8e52-a6610e14cc51 73e22c49-10b8-4763-af2f-4c0cce007c82
+openstackcluster.infrastructure.cluster.x-k8s.io/demo demo true 4b6b2722-ee5b-40ec-8e52-a6610e14cc51 73e22c49-10b8-4763-af2f-4c0cce007c82
 
 NAME CLUSTER INSTANCESTATE READY PROVIDERID MACHINE
 openstackmachine.infrastructure.cluster.x-k8s.io/demo-control-plane-7d76d0be-d2mcr demo ACTIVE true openstack:///ea91f79a-8abb-4cb9-a2ea-8f772568e93c demo-control-plane-9skvh
@@ -167,6 +167,52 @@ kubectl -n capo-system logs deploy/capo-controller-manager
 kubectl -n capi-addon-system logs deploy/cluster-api-addon-provider
 ```
 
+### Recovering clusters stuck in a failed state after network disruption
+
+If the underlying cloud infrastructure has undergone maintenance or suffered
+from temporary networking problems, clusters can get stuck in a 'Failed' state
+even after the network has recovered and the cluster is otherwise fully
+functional.
+This can happen when `failureMessage` and `failureReason` are set, which
+Cluster API mistakenly interprets as an unrecoverable error and therefore
+changes the cluster's status to `Failed`. There are ongoing discussions in the
+Kubernetes community about resolving this mistaken interpretation of transient
+networking errors, but for now the failed status must be cleared manually.
+
+If you think this is the case, you can check for affected clusters with the following command:
+
+```command title="On the K3s node, targeting the HA cluster if deployed"
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json | jq -r '.items[] | "\(.metadata.name): \(.status.failureMessage) \(.status.failureReason)"'
+```
+
+Clusters where one or both of the `failure{Message,Reason}` fields is not
+`null` are affected.
+You can reset the status for an individual cluster by removing the
+failure message and reason fields using
+`kubectl edit --subresource=status clusters.cluster.x-k8s.io/<cluster-name>`.
+Alternatively, you can apply a patch to all workload clusters at once using the
+following commands:
+
+```command title="On the K3s node, targeting the HA cluster if deployed"
+# Extract the list of failed clusters and generate the required `kubectl patch` command for each one
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
+  | jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "kubectl patch cluster.cluster.x-k8s.io \(.metadata.name) -n \(.metadata.namespace) --type=merge --subresource=status -p '\''{\"status\": {\"failureMessage\": null, \"failureReason\": null}}'\''"'
+kubectl patch cluster.cluster.x-k8s.io demo1 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo2 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo3 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo4 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+
+# The same pipeline with the output piped into `sh` so that the generated `kubectl patch` commands are executed to fix the failed clusters
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
+  | jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "kubectl patch cluster.cluster.x-k8s.io \(.metadata.name) -n \(.metadata.namespace) --type=merge --subresource=status -p '\''{\"status\": {\"failureMessage\": null, \"failureReason\": null}}'\''"' \
+  | sh
+cluster.cluster.x-k8s.io/demo1 patched
+cluster.cluster.x-k8s.io/demo2 patched
+cluster.cluster.x-k8s.io/demo3 patched
+cluster.cluster.x-k8s.io/demo4 patched
+```
+
 ## Accessing tenant clusters
 
 The kubeconfigs for all tenant clusters are stored as secrets. First, you need
````
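After patching, the same `jq` filter can be reused as a quick re-check; an empty result means no cluster still has failure fields set. This is a sketch using only the commands shown in the section above.

```bash
# Re-run the failure-field check; any clusters listed here still need attention.
kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
  | jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "\(.metadata.namespace)/\(.metadata.name) still has failureMessage/failureReason set"'
```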

docs/operations/maintenance.md

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@

````markdown
# Maintenance

## Pausing reconciliation of tenant and management clusters

Kubernetes clusters will automatically reconcile when resources are detected as
unavailable. Usually this is good, intended behaviour. However, if we have a
known period of time where statuses are expected to be incorrect or
unavailable, such as an outage window for OpenStack APIs, it is sensible to
pause reconciliation.

Reconciliation should be paused for all tenant clusters and for the CAPI
management cluster.

### Tenant clusters

Follow these steps to access the Seed VM and target the management cluster.

Apply the annotation ``cluster.x-k8s.io/paused=true`` to all clusters.

```bash
kubectl annotate --all --all-namespaces clusters.cluster.x-k8s.io cluster.x-k8s.io/paused=true
cluster.cluster.x-k8s.io/test-1 annotated
cluster.cluster.x-k8s.io/test-2 annotated
```

After the system is back in a stable state, remove the
``cluster.x-k8s.io/paused`` annotation.

```bash
kubectl annotate --all --all-namespaces clusters.cluster.x-k8s.io cluster.x-k8s.io/paused-
cluster.cluster.x-k8s.io/test-1 annotated
cluster.cluster.x-k8s.io/test-2 annotated
```

### Management cluster

Follow these steps to access the Seed VM and target the K3s cluster.

Get the name of the cluster.

```bash
kubectl get clusters.cluster.x-k8s.io
NAME           CLUSTERCLASS   PHASE         AGE    VERSION
cluster-name                  Provisioned   365d
```

Apply the annotation ``cluster.x-k8s.io/paused=true`` to the cluster.

```bash
kubectl annotate clusters.cluster.x-k8s.io/cluster-name cluster.x-k8s.io/paused=true
cluster.cluster.x-k8s.io/cluster-name annotated
```

After the system is back in a stable state, remove the
``cluster.x-k8s.io/paused`` annotation.

```bash
kubectl annotate clusters.cluster.x-k8s.io/cluster-name cluster.x-k8s.io/paused-
cluster.cluster.x-k8s.io/cluster-name annotated
```
````
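To confirm which clusters currently carry the pause annotation (for example, after running the annotate commands above), a small check along these lines can help; it assumes `jq` is available, as in the debugging docs.

```bash
# List every Cluster API cluster with the value of its paused annotation;
# "null" means the cluster is not currently paused.
kubectl get clusters.cluster.x-k8s.io --all-namespaces -o json \
  | jq -r '.items[] | "\(.metadata.namespace)/\(.metadata.name): paused=\(.metadata.annotations["cluster.x-k8s.io/paused"] // "null")"'
```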

docs/repository/index.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -153,3 +153,8 @@ git push --set-upstream origin upgrade/$RELEASE_TAG
 
 You can now open a merge (or pull) request proposing the upgrade to your `main` branch
 that can be reviewed like any other.
+
+Once the upgrade branch has been merged into your `main` branch, you can follow the
+steps for [Activating an environment](../deployment/index.md#activating-an-environment),
+and [Deploying Azimuth](../deployment/index.md#deploying-an-environment) to deploy the
+upgrade.
```

environments/base/inventory/group_vars/all.yml

Lines changed: 5 additions & 1 deletion
```diff
@@ -67,6 +67,10 @@ azimuth_apps_enabled: yes
 azimuth_kubernetes_enabled: yes
 azimuth_clusters_enabled: yes
 
+# Indicates whether to install FluxCD on management cluster
+# (required to install Flux-based addons)
+flux_enabled: false
+
 # The base domain for Azimuth ingress resources
 # This should be set by the concrete environment
 ingress_base_domain: "{{ undef(hint = 'ingress_base_domain is required') }}"
@@ -152,7 +156,7 @@ __os_auth_url: >-
   {{-
     lookup('file', __os_clouds_file) |
     from_yaml |
-    json_query('clouds.' + __os_cloud + '.auth.auth_url') |
+    json_query('clouds.' + '"%s"' % __os_cloud + '.auth.auth_url') |
     trim('/')
   }}
 azimuth_openstack_auth_url: "{{ __os_auth_url.removesuffix('/v3') }}/v3"
```
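The `json_query` change above wraps the cloud name in double quotes inside the JMESPath expression, so that cloud names which are not plain identifiers (for example ones containing a dash) still resolve. A minimal illustration, assuming Python 3 with the `jmespath` package (the library behind Ansible's `json_query` filter) and a hypothetical cloud named `my-cloud`:

```bash
# Demonstrate why the cloud name must be quoted in the JMESPath query.
python3 - <<'EOF'
import jmespath

clouds = {"clouds": {"my-cloud": {"auth": {"auth_url": "https://keystone.example.org:5000/"}}}}

# Unquoted, as in the old expression: 'my-cloud' is not a valid identifier
try:
    print(jmespath.search("clouds.my-cloud.auth.auth_url", clouds))
except jmespath.exceptions.JMESPathError as exc:
    print(f"unquoted query failed: {exc}")

# Quoted, as in the new expression: works for any cloud name
print(jmespath.search('clouds."my-cloud".auth.auth_url', clouds))
EOF
```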

environments/capi-mgmt-example/inventory/group_vars/all/variables.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,7 +33,7 @@
 
 # The Kubernetes version that will be used for the HA cluster
 # This should match the specified image
-# capi_cluster_kubernetes_version: 1.29.11
+# capi_cluster_kubernetes_version: 1.30.12
 
 # The name of the flavor to use for control plane nodes
 # At least 2 CPUs and 8GB RAM is required
```

environments/capi-mgmt/inventory/group_vars/all.yml

Lines changed: 9 additions & 9 deletions
```diff
@@ -24,28 +24,28 @@ infra_flavor_id: >-
 # Upload the Kubernetes image we need for the HA cluster as a private image
 # By default, we get the image from the azimuth-images version
 community_images_default:
-  kube_1_29:
-    name: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].name }}"
-    source_url: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].url }}"
-    checksum: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].checksum }}"
+  kube_1_30:
+    name: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].name }}"
+    source_url: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].url }}"
+    checksum: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].checksum }}"
     source_disk_format: "qcow2"
     container_format: "bare"
-    kubernetes_version: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].kubernetes_version }}"
+    kubernetes_version: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].kubernetes_version }}"
 community_images_default_visibility: private
 community_images_update_existing_visibility: false
 
 capi_cluster_kubernetes_version: >-
   {{-
-    community_images.kube_1_29.kubernetes_version
-    if community_images is defined and 'kube_1_29' in community_images
+    community_images.kube_1_30.kubernetes_version
+    if community_images is defined and 'kube_1_30' in community_images
     else undef(hint = 'capi_cluster_kubernetes_version is required')
   }}
 capi_cluster_machine_image_id: >-
   {{-
-    community_images_image_ids.kube_1_29
+    community_images_image_ids.kube_1_30
     if (
       community_images_image_ids is defined and
-      'kube_1_29' in community_images_image_ids
+      'kube_1_30' in community_images_image_ids
     )
     else undef(hint = 'capi_cluster_machine_image_id is required')
   }}
```
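A possible sanity check after this 1.29 to 1.30 bump, sketched under assumptions (the image name filter and the node you run `kubectl` from are not prescribed by this change): confirm the new private image exists and that the HA cluster's control plane eventually reports a matching version.

```bash
# Look for the uploaded Kubernetes 1.30 image; the name filter is an assumption,
# adjust it to match the azimuth-images manifest entry for your release.
openstack image list --private --format value --column Name | grep -i '1\.30' \
  || echo "no 1.30 image found yet"

# From the seed/K3s node: the control plane version of the HA cluster should
# converge on the 1.30 release referenced by the manifest.
kubectl get kubeadmcontrolplane.controlplane.cluster.x-k8s.io --all-namespaces
```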

environments/ecmwf-base/inventory/group_vars/all/variables.yml

Lines changed: 4 additions & 1 deletion
```diff
@@ -4,7 +4,7 @@
 
 # The size in GB for the data volume
 # This will hold all cluster data, including Kubernetes resources, and also PVC data
-infra_data_volume_size: 50
+infra_data_volume_size: 100
 
 #####
 # Configuration for the HA cluster
@@ -180,3 +180,6 @@ velero_backup_schedule_timings: "0 0 * * *"
 # Time-to-live for existing backups (defaults to 1 week)
 # See https://pkg.go.dev/time#ParseDuration for duration format options
 velero_backup_schedule_ttl: "168h"
+
+# Double size of Prometheus volume
+capi_cluster_addons_monitoring_prometheus_volume_size: 20Gi
```
