diff --git a/.github/workflows/cleanup-ci-resources.yml b/.github/workflows/cleanup-ci-resources.yml new file mode 100644 index 00000000..d5ed28c5 --- /dev/null +++ b/.github/workflows/cleanup-ci-resources.yml @@ -0,0 +1,93 @@ +--- +name: Clean up stale CI resources +on: + schedule: + # Every 2 hours at 8 minutes past + - cron: '8 0/2 * * *' + workflow_dispatch: + inputs: + delete-resources: + type: boolean + description: "Delete resources older than 6h" + required: true + delete-all-keypairs: + type: boolean + description: "Delete all CI user keypairs" + required: true + target-cloud: + description: >- + The cloud to target for the run. + Leave blank to use the default cloud. + type: choice + options: + - "" + - arcus + - leafcloud + + +permissions: {} + +jobs: + ci-cleanup: + name: Clean up stale CI resources + if: github.repository == 'azimuth-cloud/azimuth-config' + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Setup Python + uses: actions/setup-python@v5 + + - name: Generate clouds.yaml + run: | + cat << EOF > clouds.yaml + ${{ secrets.OS_CLOUDS }} + EOF + + - name: Install OpenStack client + run: | + pip install python-openstackclient + + - name: Clean up instances and attached volumes over 6 hours old + if: ${{ github.event_name == 'schedule' || inputs.delete-resources }} + run: | + result=0 + changes_before=$(date -Imin -d -6hours) + for status in ACTIVE BUILD ERROR SHUTOFF; do + for instance in $(openstack server list --unlocked --format value --column ID --changes-before $changes_before --status $status); do + echo "Cleaning up $status instance $instance" + openstack server show $instance + echo "Getting volumes for instance $instance" + volumes=$(openstack server volume list -f value -c "Volume ID" $instance) + keypair=$(openstack server show $instance -f value -c key_name) + if ! openstack server delete $instance; then + echo "Failed to delete $status instance $instance" + result=1 + fi + echo "Deleting keypair for instance $instance" + # This shouldn't fail, but might if the keypair is in-use elsewhere + openstack keypair delete $keypair || true + for volume in $volumes; do + echo "Cleaning up volume $volume from instance $instance" + openstack volume show $volume + if ! 
openstack volume delete $volume; then
+                  echo "Failed to delete volume $volume"
+                  result=1
+                fi
+              done
+            done
+          done
+          exit $result
+        env:
+          OS_CLOUD: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}
+
+      - name: Clean up all SSH keypairs
+        if: ${{ inputs.delete-all-keypairs }}
+        run: |
+          for keypair in $(openstack keypair list --format value -c Name); do
+            if [[ "$keypair" =~ ^azimuth- || "$keypair" =~ ^packer_ ]]; then
+              openstack keypair delete $keypair
+              echo "Deleted keypair $keypair"
+            fi
+          done
+        env:
+          OS_CLOUD: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}
diff --git a/Tiltfile b/Tiltfile
index f83da3bf..eb411774 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -75,7 +75,7 @@ settings = deep_merge(
             "release_namespace": "azimuth",
         },
         "coral-credits": {
-            "release_namespace": "coral-credits",
+            "release_namespace": "azimuth",
         },
         "cluster-api-addon-provider": {
             "release_namespace": "capi-addon-system",
diff --git a/bin/ci-setup b/bin/ci-setup
index e45a3a87..ced7839c 100755
--- a/bin/ci-setup
+++ b/bin/ci-setup
@@ -49,7 +49,7 @@ else
 fi

 run_apt update
-run_apt install -y -t 'o=LP-PPA-mozillateam' firefox
+run_apt install -y -t 'o=LP-PPA-mozillateam' firefox-esr

 pip install -U pip
 pip install -r requirements.txt
diff --git a/docs/debugging/kubernetes.md b/docs/debugging/kubernetes.md
index 3a51dd39..4d4f8500 100644
--- a/docs/debugging/kubernetes.md
+++ b/docs/debugging/kubernetes.md
@@ -121,7 +121,7 @@ NAME   CLUSTER   REPLICAS   READY   UPDATE
 machinedeployment.cluster.x-k8s.io/demo-sm0   demo   1   1   1   0   Running   11d   v1.24.2

 NAME                            PHASE         AGE   VERSION
-cluster.cluster.x-k8s.io/demo   Provisioned   11d
+cluster.cluster.x-k8s.io/demo   Provisioned   11d

 NAME                                                CLUSTER   NODENAME                            PROVIDERID                                          PHASE     AGE   VERSION
 machine.cluster.x-k8s.io/demo-control-plane-7p8zv   demo      demo-control-plane-7d76d0be-z6dm8   openstack:///f687f926-3cee-4550-91e5-32c2885708b0   Running   11d   v1.24.2
@@ -133,7 +133,7 @@ NAME   CLUSTER
 kubeadmcontrolplane.controlplane.cluster.x-k8s.io/demo-control-plane   demo   true   true   3   3   3   0   11d   v1.24.2

 NAME                                                     CLUSTER   READY   NETWORK                                SUBNET                                 BASTION IP
-openstackcluster.infrastructure.cluster.x-k8s.io/demo    demo      true    4b6b2722-ee5b-40ec-8e52-a6610e14cc51   73e22c49-10b8-4763-af2f-4c0cce007c82
+openstackcluster.infrastructure.cluster.x-k8s.io/demo    demo      true    4b6b2722-ee5b-40ec-8e52-a6610e14cc51   73e22c49-10b8-4763-af2f-4c0cce007c82

 NAME                                                                                  CLUSTER   INSTANCESTATE   READY   PROVIDERID                                          MACHINE
 openstackmachine.infrastructure.cluster.x-k8s.io/demo-control-plane-7d76d0be-d2mcr    demo      ACTIVE          true    openstack:///ea91f79a-8abb-4cb9-a2ea-8f772568e93c   demo-control-plane-9skvh
@@ -167,6 +167,52 @@ kubectl -n capo-system logs deploy/capo-controller-manager
 kubectl -n capi-addon-system logs deploy/cluster-api-addon-provider
 ```

+### Recovering clusters stuck in failed state after network disruption
+
+If the underlying cloud infrastructure has undergone maintenance or suffered
+from temporary networking problems, clusters can get stuck in a 'Failed' state
+even after the network is recovered and the cluster is otherwise fully
+functional.
+This can happen when `failureMessage` and `failureReason` are set, which
+Cluster API mistakenly interprets as an unrecoverable error and therefore
+changes the cluster's status to `Failed`. There are ongoing discussions in the
+Kubernetes community about resolving this mistaken interpretation of transient
+networking errors, but for now this failed status must be manually cleared.
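+
+As a rough illustration (this snippet is not taken from a real cluster, and the
+exact reason and message will vary with the infrastructure provider and the
+nature of the outage), the status of an affected cluster looks something like
+this:
+
+```yaml
+# Illustrative values only - the real failureReason and failureMessage differ
+status:
+  phase: Failed
+  failureReason: UpdateError
+  failureMessage: >-
+    error communicating with the OpenStack API during the outage window
+```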
+
+If you think this is the case, you can check for affected clusters with the following command:
+
+```command title="On the K3s node, targeting the HA cluster if deployed"
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json | jq -r '.items[] | "\(.metadata.name): \(.status.failureMessage) \(.status.failureReason)"'
+```
+
+Clusters where one or both of the `failure{Message,Reason}` fields are not
+`null` are affected.
+You can reset the status for an individual cluster by removing the
+failure message and reason fields using
+`kubectl edit --subresource=status clusters.cluster.x-k8s.io/<cluster-name>`.
+Alternatively, you can apply a patch to all workload clusters at once using the
+following command:
+
+```command title="On the K3s node, targeting the HA cluster if deployed"
+# Shell command to extract the list of failed clusters and generate the required `kubectl patch` command for each one
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
+| jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "kubectl patch cluster.cluster.x-k8s.io \(.metadata.name) -n \(.metadata.namespace) --type=merge --subresource=status -p '\''{\"status\": {\"failureMessage\": null, \"failureReason\": null}}'\''"'
+kubectl patch cluster.cluster.x-k8s.io demo1 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo2 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo3 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+kubectl patch cluster.cluster.x-k8s.io demo4 -n az-demo --type=merge --subresource=status -p '{"status": {"failureMessage": null, "failureReason": null}}'
+
+# Modification of the previous command which pipes the output into `sh` so that the `kubectl patch` commands are executed to fix the failed clusters
+$ kubectl get cluster.cluster.x-k8s.io --all-namespaces -o json \
+| jq -r '.items[] | select(.status.failureMessage or .status.failureReason) | "kubectl patch cluster.cluster.x-k8s.io \(.metadata.name) -n \(.metadata.namespace) --type=merge --subresource=status -p '\''{\"status\": {\"failureMessage\": null, \"failureReason\": null}}'\''"' \
+| sh
+cluster.cluster.x-k8s.io/demo1 patched
+cluster.cluster.x-k8s.io/demo2 patched
+cluster.cluster.x-k8s.io/demo3 patched
+cluster.cluster.x-k8s.io/demo4 patched
+
+```
+
 ## Accessing tenant clusters

 The kubeconfigs for all tenant clusters are stored as secrets. First, you need
diff --git a/docs/operations/maintenance.md b/docs/operations/maintenance.md
new file mode 100644
index 00000000..c1469a11
--- /dev/null
+++ b/docs/operations/maintenance.md
@@ -0,0 +1,60 @@
+# Maintenance
+
+## Pausing reconciliation of tenant and management clusters
+
+Kubernetes clusters will automatically reconcile when resources are detected as
+unavailable. Usually this is good, intended behaviour. However, if we have a
+known period of time where statuses are expected to be incorrect or
+unavailable, such as an outage window for OpenStack APIs, it is sensible to
+pause reconciliation.
+
+Reconciliation should be paused for all tenant clusters and the CAPI management
+cluster.
+
+### Tenant clusters
+
+Follow these steps to access the Seed VM and target the management cluster.
+
+Apply the annotation ``cluster.x-k8s.io/paused=true`` to all clusters.
+ +```bash +kubectl annotate --all --all-namespaces clusters.cluster.x-k8s.io cluster.x-k8s.io/paused=true +cluster.cluster.x-k8s.io/test-1 annotated +cluster.cluster.x-k8s.io/test-2 annotated +``` + +After the system is back in a stable state, remove the +``cluster.x-k8s.io/paused`` annotation. + +```bash +kubectl annotate --all --all-namespaces clusters.cluster.x-k8s.io cluster.x-k8s.io/paused- +cluster.cluster.x-k8s.io/test-1 annotated +cluster.cluster.x-k8s.io/test-2 annotated +``` + +### Management cluster + +Follow these steps to access the Seed VM and target the K3s cluster. + +Get the name of the cluster. + +```bash +kubectl get clusters.cluster.x-k8s.io +NAME CLUSTERCLASS PHASE AGE VERSION +cluster-name Provisioned 365d +``` + +Apply the annotation ``cluster.x-k8s.io/paused=true`` to the cluster. + +```bash +kubectl annotate clusters.cluster.x-k8s.io/cluster-name cluster.x-k8s.io/paused=true +cluster.cluster.x-k8s.io/cluster-name annotated +``` + +After the system is back in a stable state, remove the +``cluster.x-k8s.io/paused`` annotation. + +```bash +kubectl annotate clusters.cluster.x-k8s.io/cluster-name cluster.x-k8s.io/paused- +cluster.cluster.x-k8s.io/cluster-name annotated +``` diff --git a/docs/repository/index.md b/docs/repository/index.md index 1b73e8b0..e5343808 100644 --- a/docs/repository/index.md +++ b/docs/repository/index.md @@ -153,3 +153,8 @@ git push --set-upstream origin upgrade/$RELEASE_TAG You can now open a merge (or pull) request proposing the upgrade to your `main` branch that can be reviewed like any other. + +Once the upgrade branch has been merged into your `main` branch, you can follow the +steps for [Activating an environment](../deployment/index.md#activating-an-environment), +and [Deploying Azimuth](../deployment/index.md#deploying-an-environment) to deploy the +upgrade. \ No newline at end of file diff --git a/environments/base/inventory/group_vars/all.yml b/environments/base/inventory/group_vars/all.yml index 72f0cf3d..87f60560 100644 --- a/environments/base/inventory/group_vars/all.yml +++ b/environments/base/inventory/group_vars/all.yml @@ -67,6 +67,10 @@ azimuth_apps_enabled: yes azimuth_kubernetes_enabled: yes azimuth_clusters_enabled: yes +# Indicates whether to install FluxCD on management cluster +# (required to install Flux-based addons) +flux_enabled: false + # The base domain for Azimuth ingress resources # This should be set by the concrete environment ingress_base_domain: "{{ undef(hint = 'ingress_base_domain is required') }}" @@ -152,7 +156,7 @@ __os_auth_url: >- {{- lookup('file', __os_clouds_file) | from_yaml | - json_query('clouds.' + __os_cloud + '.auth.auth_url') | + json_query('clouds.' 
+ '"%s"' % __os_cloud + '.auth.auth_url') | trim('/') }} azimuth_openstack_auth_url: "{{ __os_auth_url.removesuffix('/v3') }}/v3" diff --git a/environments/capi-mgmt-example/inventory/group_vars/all/variables.yml b/environments/capi-mgmt-example/inventory/group_vars/all/variables.yml index bf69ba3a..6cc5354f 100644 --- a/environments/capi-mgmt-example/inventory/group_vars/all/variables.yml +++ b/environments/capi-mgmt-example/inventory/group_vars/all/variables.yml @@ -33,7 +33,7 @@ # The Kubernetes version that will be used for the HA cluster # This should match the image specified image -# capi_cluster_kubernetes_version: 1.29.11 +# capi_cluster_kubernetes_version: 1.30.12 # The name of the flavor to use for control plane nodes # At least 2 CPUs and 8GB RAM is required diff --git a/environments/capi-mgmt/inventory/group_vars/all.yml b/environments/capi-mgmt/inventory/group_vars/all.yml index 14585dfe..1fbab3fb 100644 --- a/environments/capi-mgmt/inventory/group_vars/all.yml +++ b/environments/capi-mgmt/inventory/group_vars/all.yml @@ -24,28 +24,28 @@ infra_flavor_id: >- # Upload the Kubernetes image we need for the HA cluster as a private image # By default, we get the image from the azimuth-images version community_images_default: - kube_1_29: - name: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].name }}" - source_url: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].url }}" - checksum: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].checksum }}" + kube_1_30: + name: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].name }}" + source_url: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].url }}" + checksum: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].checksum }}" source_disk_format: "qcow2" container_format: "bare" - kubernetes_version: "{{ community_images_azimuth_images_manifest['kubernetes-1-29-jammy'].kubernetes_version }}" + kubernetes_version: "{{ community_images_azimuth_images_manifest['kubernetes-1-30-jammy'].kubernetes_version }}" community_images_default_visibility: private community_images_update_existing_visibility: false capi_cluster_kubernetes_version: >- {{- - community_images.kube_1_29.kubernetes_version - if community_images is defined and 'kube_1_29' in community_images + community_images.kube_1_30.kubernetes_version + if community_images is defined and 'kube_1_30' in community_images else undef(hint = 'capi_cluster_kubernetes_version is required') }} capi_cluster_machine_image_id: >- {{- - community_images_image_ids.kube_1_29 + community_images_image_ids.kube_1_30 if ( community_images_image_ids is defined and - 'kube_1_29' in community_images_image_ids + 'kube_1_30' in community_images_image_ids ) else undef(hint = 'capi_cluster_machine_image_id is required') }} diff --git a/mkdocs.yml b/mkdocs.yml index 9a1144da..5000d42d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -37,6 +37,8 @@ nav: - deployment/index.md - deployment/automation.md - deployment/testing.md + - Operations: + - operations/maintenance.md - Debugging: - debugging/index.md - debugging/access-k3s.md diff --git a/requirements.yml b/requirements.yml index 433503ef..00896d29 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ collections: - name: https://github.com/azimuth-cloud/ansible-collection-azimuth-ops.git type: git - version: 0.14.4 + version: 0.15.0 # For local development # - type: dir # source: ../ansible-collection-azimuth-ops