Skip to content

Commit 7f0af9e

Browse files
committed
Merge branch 'main' into feature/k3s-monitoring
2 parents 8ca0407 + f23be23 commit 7f0af9e

File tree

6 files changed

+39
-25
lines changed

.github/workflows/nightly-cleanup.yml

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,8 @@
11
name: Cleanup CI clusters
22
on:
33
workflow_dispatch:
4-
inputs:
5-
ci_cloud:
6-
description: 'Select the CI_CLOUD'
7-
required: true
8-
type: choice
9-
options:
10-
- LEAFCLOUD
11-
- SMS
12-
- ARCUS
134
schedule:
14-
- cron: '0 20 * * *' # Run at 8PM - image sync runs at midnight
5+
- cron: '0 21 * * *' # Run at 9PM - image sync runs at midnight
156

167
jobs:
178
ci_cleanup:
@@ -52,20 +43,35 @@ jobs:
5243
- name: Find CI clusters
5344
run: |
5445
. venv/bin/activate
55-
CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq)
56-
echo "ci_clusters=${CI_CLUSTERS}" >> GITHUB_ENV
46+
CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq || true)
47+
echo "DEBUG: Raw CI clusters: $CI_CLUSTERS"
48+
49+
if [[ -z "$CI_CLUSTERS" ]]; then
50+
echo "No matching CI clusters found."
51+
else
52+
# Flatten the multiline value so it can be passed as an env var
53+
CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//')
54+
echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED"
55+
echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> $GITHUB_ENV
56+
fi
5757
shell: bash
5858

5959
- name: Delete clusters if control node not tagged with keep
6060
run: |
6161
. venv/bin/activate
62-
for cluster_prefix in ${CI_CLUSTERS}
62+
if [[ -z ${ci_clusters} ]]; then
63+
echo "No clusters to delete."
64+
exit 0
65+
fi
66+
67+
for cluster_prefix in ${ci_clusters}
6368
do
69+
echo "Processing cluster: $cluster_prefix"
6470
TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value)
6571
if [[ $TAGS =~ "keep" ]]; then
6672
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
6773
else
68-
yes | ./dev/delete-cluster.py ${cluster_prefix}
74+
./dev/delete-cluster.py ${cluster_prefix} --force
6975
fi
7076
done
7177
shell: bash

ansible/adhoc/rebuild.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,6 @@
1616
- command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}"
1717
delegate_to: localhost
1818
- wait_for_connection:
19+
delay: 60
20+
timeout: 600
21+

ansible/extras.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
- name: Install k9s
4141
become: yes
4242
hosts: k9s
43+
tags: k9s
4344
tasks:
4445
- import_role:
4546
name: k9s

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ resource "openstack_compute_instance_v2" "control" {
471471
{%- endif %}
472472
bootcmd:
473473
%{for volume in [openstack_blockstorage_volume_v3.state, {% if not cluster_home_manila_share | bool %} openstack_blockstorage_volume_v3.home {% endif %}]}
474-
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
474+
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${replace(substr(volume.id, 0, 20), "-", "*")}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
475475
%{endfor}
476476
mounts:
477477
- [LABEL=state, {{ appliances_state_dir }}, auto]

ansible/roles/k9s/tasks/main.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
- name: Check if k9s is installed
44
ansible.builtin.stat:
55
path: "/usr/bin/k9s"
6-
register: result
6+
register: _k9s_stat_result
77

88
- name: Install k9s and clean up temporary files
99
block:
@@ -14,7 +14,7 @@
1414
owner: root
1515
group: root
1616
mode: "744"
17-
when: not result.stat.exists
17+
when: not _k9s_stat_result.stat.exists
1818

1919
- name: Download k9s
2020
ansible.builtin.get_url:
@@ -41,4 +41,4 @@
4141
ansible.builtin.file:
4242
path: /tmp/k9s
4343
state: absent
44-
when: not result.stat.exists
44+
when: not _k9s_stat_result.stat.exists

dev/delete-cluster.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,18 @@
44
Delete infrastructure for a cluster without using Terraform. Useful for CI clusters.
55
66
Usage:
7-
delete-cluster.py PREFIX
7+
delete-cluster.py PREFIX [--force]
88
99
Where PREFIX is the string at the start of the resource's names.
10-
It will list matching resources and prompt to confirm deletion.
10+
If --force is provided, it will delete all resources without confirmation.
1111
"""
1212

13-
import sys, json, subprocess, pprint
13+
import sys, json, subprocess
1414

1515

1616
CLUSTER_RESOURCES = ['server', 'port', 'volume']
1717

18-
def delete_cluster(cluster_prefix):
18+
def delete_cluster(cluster_prefix, force=False):
1919
to_delete = {}
2020
for resource_type in CLUSTER_RESOURCES:
2121
to_delete[resource_type] = []
@@ -29,7 +29,8 @@ def delete_cluster(cluster_prefix):
2929
except:
3030
print(resource_type, item)
3131
raise
32-
if input('Delete these (y/n)?:') == 'y':
32+
33+
if force or input('Delete these (y/n)?:') == 'y':
3334
for resource_type in CLUSTER_RESOURCES:
3435
items = [v['ID'] for v in to_delete[resource_type]]
3536
if items:
@@ -40,7 +41,10 @@ def delete_cluster(cluster_prefix):
4041
print('Cancelled - no resources deleted')
4142

4243
if __name__ == '__main__':
43-
if len(sys.argv) != 2:
44+
if len(sys.argv) < 2 or len(sys.argv) > 3:
4445
print('ERROR: Incorrect argument(s).\n' + __doc__)
4546
exit(1)
46-
delete_cluster(sys.argv[1])
47+
force_flag = '--force' in sys.argv
48+
cluster_prefix = sys.argv[1]
49+
delete_cluster(cluster_prefix, force_flag)
50+

0 commit comments

Comments
 (0)