Skip to content

Commit 1e1779c

Browse files
authored
Merge branch 'main' into feat/compute-script
2 parents feab4cf + e44e084 commit 1e1779c

File tree

26 files changed

+324
-36
lines changed

26 files changed

+324
-36
lines changed

.github/workflows/nightly-cleanup.yml

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,8 @@
11
name: Cleanup CI clusters
22
on:
33
workflow_dispatch:
4-
inputs:
5-
ci_cloud:
6-
description: 'Select the CI_CLOUD'
7-
required: true
8-
type: choice
9-
options:
10-
- LEAFCLOUD
11-
- SMS
12-
- ARCUS
134
schedule:
14-
- cron: '0 20 * * *' # Run at 8PM - image sync runs at midnight
5+
- cron: '0 21 * * *' # Run at 9PM - image sync runs at midnight
156

167
jobs:
178
ci_cleanup:
@@ -52,20 +43,35 @@ jobs:
5243
- name: Find CI clusters
5344
run: |
5445
. venv/bin/activate
55-
CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq)
56-
echo "ci_clusters=${CI_CLUSTERS}" >> GITHUB_ENV
46+
CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq || true)
47+
echo "DEBUG: Raw CI clusters: $CI_CLUSTERS"
48+
49+
if [[ -z "$CI_CLUSTERS" ]]; then
50+
echo "No matching CI clusters found."
51+
else
52+
# Flatten the multiline value so it can be passed as an env var
53+
CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//')
54+
echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED"
55+
echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> $GITHUB_ENV
56+
fi
5757
shell: bash
5858

5959
- name: Delete clusters if control node not tagged with keep
6060
run: |
6161
. venv/bin/activate
62-
for cluster_prefix in ${CI_CLUSTERS}
62+
if [[ -z ${ci_clusters} ]]; then
63+
echo "No clusters to delete."
64+
exit 0
65+
fi
66+
67+
for cluster_prefix in ${ci_clusters}
6368
do
69+
echo "Processing cluster: $cluster_prefix"
6470
TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value)
6571
if [[ $TAGS =~ "keep" ]]; then
6672
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
6773
else
68-
yes | ./dev/delete-cluster.py ${cluster_prefix}
74+
./dev/delete-cluster.py ${cluster_prefix} --force
6975
fi
7076
done
7177
shell: bash

ansible/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,9 @@ roles/*
6060
!roles/tuned/**
6161
!roles/compute_init/
6262
!roles/compute_init/**
63+
!roles/k3s/
64+
!roles/k3s/**
65+
!roles/k9s/
66+
!roles/k9s/**
6367
!roles/lustre/
6468
!roles/lustre/**

ansible/bootstrap.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,3 +259,11 @@
259259
tasks:
260260
- include_role:
261261
name: azimuth_cloud.image_utils.linux_ansible_init
262+
263+
- hosts: k3s
264+
become: yes
265+
tags: k3s
266+
tasks:
267+
- ansible.builtin.include_role:
268+
name: k3s
269+
tasks_from: install.yml

ansible/cleanup.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838

3939
- name: Cleanup /tmp
4040
command : rm -rf /tmp/*
41-
41+
4242
- name: Get package facts
4343
package_facts:
4444

ansible/extras.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,12 @@
4343
become: yes
4444
tasks:
4545
- import_role:
46-
name: compute_init
46+
name: compute_init
47+
48+
- name: Install k9s
49+
become: yes
50+
hosts: k9s
51+
tags: k9s
52+
tasks:
53+
- import_role:
54+
name: k9s

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ data "openstack_identity_auth_scope_v3" "scope" {
77
name = "{{ cluster_name }}"
88
}
99

10+
####
11+
#### Data resources
12+
####
13+
14+
resource "terraform_data" "k3s_token" {
15+
input = "{{ k3s_token }}"
16+
lifecycle {
17+
ignore_changes = [
18+
input, # makes it a write-once value (set via Ansible)
19+
]
20+
}
21+
}
22+
1023
#####
1124
##### Security groups for the cluster
1225
#####
@@ -386,6 +399,8 @@ resource "openstack_compute_instance_v2" "login" {
386399
ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
387400
{% endif %}
388401
{% endfor %}
402+
k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
403+
k3s_token = "{{ k3s_token }}"
389404
}
390405
}
391406

@@ -400,6 +415,7 @@ resource "openstack_compute_instance_v2" "control" {
400415

401416
network {
402417
port = openstack_networking_port_v2.control.id
418+
access_network = true
403419
}
404420

405421
{% if cluster_storage_network is defined %}
@@ -455,7 +471,7 @@ resource "openstack_compute_instance_v2" "control" {
455471
{%- endif %}
456472
bootcmd:
457473
%{for volume in [openstack_blockstorage_volume_v3.state, {% if not cluster_home_manila_share | bool %} openstack_blockstorage_volume_v3.home {% endif %}]}
458-
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
474+
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${replace(substr(volume.id, 0, 20), "-", "*")}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
459475
%{endfor}
460476
mounts:
461477
- [LABEL=state, {{ appliances_state_dir }}, auto]
@@ -479,6 +495,7 @@ resource "openstack_compute_instance_v2" "control" {
479495
ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
480496
{% endif %}
481497
{% endfor %}
498+
k3s_token = "{{ k3s_token }}"
482499
}
483500
}
484501

@@ -548,6 +565,8 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
548565
ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
549566
{% endif %}
550567
{% endfor %}
568+
k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
569+
k3s_token = "{{ k3s_token }}"
551570
}
552571
}
553572

ansible/roles/k3s/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
k3s
2+
=====
3+
4+
Installs the k3s agent and server services on nodes, along with an ansible-init playbook to activate them. The service that each node activates on init is determined by OpenStack metadata. Helm is also installed. Currently only a single k3s server is supported
5+
(i.e. one control node). The installation is based on the [official k3s Ansible role](https://github.com/k3s-io/k3s-ansible).
6+
7+
8+
Requirements
9+
------------
10+
11+
`azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes during image build.
12+
13+
Role Variables
14+
--------------
15+
16+
- `k3s_version`: Optional str. K3s version to install, see [official releases](https://github.com/k3s-io/k3s/releases/).
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
# Default versions of the components installed by the k3s role.
# Warning: changes to these variables won't be reflected in the cluster/image if k3s is already installed

# k3s release tag to download from the k3s-io/k3s GitHub releases.
k3s_version: "v1.31.0+k3s1"

# k3s-selinux GitHub release tag which hosts the SELinux policy RPM.
k3s_selinux_release: "v1.6.latest.1"

# Version component of the k3s-selinux RPM file name.
k3s_selinux_rpm_version: "1.6-1"

# Helm release downloaded from get.helm.sh.
k3s_helm_version: "v3.11.0"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
---
# ansible-init playbook: writes the cluster token and (for agents) the server
# URL into the systemd environment file of the relevant k3s service, then
# starts and enables that service. Whether this node runs the server or the
# agent is decided from the OpenStack instance metadata.
- name: Start k3s server or agent
  hosts: localhost
  become: true
  vars:
    # Instance metadata fetched from the OpenStack metadata service.
    os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
    k3s_token: "{{ os_metadata.meta.k3s_token }}"
    # Left undefined when the metadata has no k3s_server entry.
    k3s_server_name: "{{ os_metadata.meta.k3s_server }}"
    # Nodes given a server to join run the agent; otherwise run the server.
    service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}"
  tasks:
    - name: Ensure password directory exists
      ansible.builtin.file:
        path: /etc/rancher/node
        state: directory

    - name: Set agent node password as token # uses token to keep password consistent between reimages
      no_log: true # the password is the cluster token, keep it out of logs and diffs
      ansible.builtin.copy:
        dest: /etc/rancher/node/password
        content: "{{ k3s_token }}"
        mode: "0600" # secret material: readable by the owner only

    - name: Add the token for joining the cluster to the environment
      no_log: true # avoid logging the server token
      ansible.builtin.lineinfile:
        path: "/etc/systemd/system/{{ service_name }}.service.env"
        # Replace an existing entry rather than appending a duplicate line
        regexp: "^K3S_TOKEN="
        line: "K3S_TOKEN={{ k3s_token }}"

    - name: Add server url to agents
      ansible.builtin.lineinfile:
        path: "/etc/systemd/system/{{ service_name }}.service.env"
        # Replace an existing entry rather than appending a duplicate line
        regexp: "^K3S_URL="
        line: "K3S_URL=https://{{ k3s_server_name }}:6443"
      when: k3s_server_name is defined

    - name: Start k3s service
      ansible.builtin.systemd:
        name: "{{ service_name }}"
        daemon_reload: true
        state: started
        enabled: true
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
---
# Installs the k3s binary, SELinux policy, air-gap images and both the server
# and agent services (left stopped and disabled), then Helm, the KUBECONFIG
# environment variable and the ansible-init playbook which starts k3s on boot.

- name: Check for existing k3s installation
  ansible.builtin.stat:
    path: /var/lib/rancher/k3s
  register: stat_result

- name: Perform air-gapped installation of k3s
  # Using air-gapped install so containers are pre-installed to avoid rate-limiting from registries on cluster startup
  when: not stat_result.stat.exists
  block:
    - name: Download k3s binary
      ansible.builtin.get_url:
        url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s"
        dest: /usr/bin/k3s
        owner: root
        group: root
        mode: "0755"

    - name: Install k3s SELinux policy package
      # NOTE(review): GPG verification is disabled for this RPM - confirm this is intended
      ansible.builtin.yum:
        name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ ansible_distribution_major_version }}.noarch.rpm"
        disable_gpg_check: true

    - name: Create image directory
      ansible.builtin.file:
        path: /var/lib/rancher/k3s/agent/images
        state: directory

    - name: Install k3s' internal images
      ansible.builtin.get_url:
        url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s-airgap-images-amd64.tar.zst"
        dest: /var/lib/rancher/k3s/agent/images/k3s-airgap-images-amd64.tar.zst

    - name: Download k3s install script
      ansible.builtin.get_url:
        url: https://get.k3s.io/
        timeout: 120
        dest: /usr/bin/k3s-install.sh
        owner: root
        group: root
        mode: "0755"

    - name: Install k3s
      # The script is executed directly; no shell features are required
      ansible.builtin.command:
        cmd: /usr/bin/k3s-install.sh
      environment:
        INSTALL_K3S_VERSION: "{{ k3s_version }}"
        INSTALL_K3S_EXEC: "{{ item }}"
        # Services are started later by the ansible-init playbook
        INSTALL_K3S_SKIP_START: "true"
        INSTALL_K3S_SKIP_ENABLE: "true"
        INSTALL_K3S_BIN_DIR: "/usr/bin"
        # presumably makes the script use the binary downloaded above - see k3s install docs
        INSTALL_K3S_SKIP_DOWNLOAD: "true"
      changed_when: true
      # One run per role, so both the server and the agent service get installed
      loop:
        - server --disable=traefik
        - agent

- name: Install helm
  ansible.builtin.unarchive:
    src: "https://get.helm.sh/helm-{{ k3s_helm_version }}-linux-amd64.tar.gz"
    dest: /usr/bin
    extra_opts: "--strip-components=1"
    owner: root
    group: root
    mode: "0755" # quoted so the octal mode is never re-interpreted as an integer
    remote_src: true

- name: Add k3s kubeconfig as environment variable
  ansible.builtin.lineinfile:
    path: /etc/environment
    line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"

- name: Install ansible-init playbook for k3s agent or server activation
  ansible.builtin.copy:
    src: start_k3s.yml
    dest: /etc/ansible-init/playbooks/0-start-k3s.yml

0 commit comments

Comments
 (0)