diff --git a/ansible/.gitignore b/ansible/.gitignore
index f6f5c5f4d..8edcc4360 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -58,5 +58,9 @@ roles/*
 !roles/squid/**
 !roles/tuned/
 !roles/tuned/**
+!roles/k3s/
+!roles/k3s/**
+!roles/k9s/
+!roles/k9s/**
 !roles/lustre/
 !roles/lustre/**
diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index 18d159996..733d4b3f8 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -259,3 +259,12 @@
   tasks:
     - include_role:
         name: azimuth_cloud.image_utils.linux_ansible_init
+
+- name: Install k3s
+  hosts: k3s
+  become: yes
+  tags: k3s
+  tasks:
+    - ansible.builtin.include_role:
+        name: k3s
+        tasks_from: install.yml
diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml
index 9c1373667..cf9b0bdab 100644
--- a/ansible/cleanup.yml
+++ b/ansible/cleanup.yml
@@ -38,7 +38,7 @@
 
 - name: Cleanup /tmp
   command : rm -rf /tmp/*
-    
+
 - name: Get package facts
   package_facts:
 
diff --git a/ansible/extras.yml b/ansible/extras.yml
index c32f51c32..107f85252 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -36,3 +36,11 @@
   tasks:
     - import_role:
         name: persist_hostkeys
+
+- name: Install k9s
+  become: yes
+  hosts: k9s
+  tags: k9s
+  tasks:
+    - import_role:
+        name: k9s
diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2
index 344137b62..cab7cc7a2 100644
--- a/ansible/roles/cluster_infra/templates/resources.tf.j2
+++ b/ansible/roles/cluster_infra/templates/resources.tf.j2
@@ -7,6 +7,20 @@ data "openstack_identity_auth_scope_v3" "scope" {
   name = "{{ cluster_name }}"
 }
 
+####
+#### Data resources
+####
+
+# Stores the first k3s token ever supplied; instance metadata below reads its output so the token is stable across runs
+resource "terraform_data" "k3s_token" {
+  input = "{{ k3s_token }}"
+  lifecycle {
+    ignore_changes = [
+      input, # makes it a write-once value (set via Ansible)
+    ]
+  }
+}
+
 #####
 ##### Security groups for the cluster
 #####
@@ -386,6 +400,8 @@ resource "openstack_compute_instance_v2" "login" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
+    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
+    k3s_token = terraform_data.k3s_token.output # write-once value, not the per-run "{{ '{{' }} k3s_token {{ '}}' }}"
   }
 }
 
@@ -400,6 +416,7 @@ resource "openstack_compute_instance_v2" "control" {
 
   network {
     port = openstack_networking_port_v2.control.id
+    access_network = true
   }
 
   {% if cluster_storage_network is defined %}
@@ -479,6 +496,7 @@ resource "openstack_compute_instance_v2" "control" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
+    k3s_token = terraform_data.k3s_token.output
   }
 }
 
@@ -548,6 +566,8 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
+    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
+    k3s_token = terraform_data.k3s_token.output
   }
 }
 
diff --git a/ansible/roles/k3s/README.md b/ansible/roles/k3s/README.md
new file mode 100644
index 000000000..68e8e2410
--- /dev/null
+++ b/ansible/roles/k3s/README.md
@@ -0,0 +1,19 @@
+k3s
+=====
+
+Installs k3s agent and server services on nodes and an ansible-init playbook to activate them. The service that each node will activate on init is determined by OpenStack metadata. Also includes Helm install. Currently only supports a single k3s-server
+(i.e. one control node). Install based on the [official k3s ansible role](https://github.com/k3s-io/k3s-ansible).
+
+
+Requirements
+------------
+
+`azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes during image build.
+
+Role Variables
+--------------
+
+- `k3s_version`: Optional str. K3s version to install, see [official releases](https://github.com/k3s-io/k3s/releases/).
+- `k3s_selinux_release`: Optional str. Release tag of [k3s-selinux](https://github.com/k3s-io/k3s-selinux/releases/) to download the SELinux policy RPM from.
+- `k3s_selinux_rpm_version`: Optional str. Version component of the k3s-selinux RPM filename for that release.
+- `k3s_helm_version`: Optional str. Helm version to download from `get.helm.sh`.
diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml
new file mode 100644
index 000000000..ba9a1a899
--- /dev/null
+++ b/ansible/roles/k3s/defaults/main.yml
@@ -0,0 +1,9 @@
+---
+# Warning: changes to these variables won't be reflected in the cluster/image if k3s is already installed
+# k3s release tag; used for the binary, the air-gap image bundle and the install script
+k3s_version: "v1.31.0+k3s1"
+# k3s-selinux release tag and the RPM version embedded in the package filename
+k3s_selinux_release: "v1.6.latest.1"
+k3s_selinux_rpm_version: "1.6-1"
+# Helm release downloaded from get.helm.sh
+k3s_helm_version: "v3.11.0"
diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml
new file mode 100644
index 000000000..8ee0e6114
--- /dev/null
+++ b/ansible/roles/k3s/files/start_k3s.yml
@@ -0,0 +1,43 @@
+---
+# ansible-init playbook: configures and starts either the k3s server or the
+# k3s agent service on this node, as selected by OpenStack instance metadata.
+- name: Start k3s server or agent
+  hosts: localhost
+  become: true
+  vars:
+    os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
+    k3s_token: "{{ os_metadata.meta.k3s_token }}"
+    k3s_server_name: "{{ os_metadata.meta.k3s_server }}"
+    # k3s_server metadata is only set on agent nodes, so an undefined value selects the server service
+    service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}"
+  tasks:
+    - name: Ensure password directory exists
+      ansible.builtin.file:
+        path: "/etc/rancher/node"
+        state: directory
+        mode: "0755"
+
+    - name: Set agent node password as token # uses token to keep password consistent between reimages
+      ansible.builtin.copy:
+        dest: /etc/rancher/node/password
+        content: "{{ k3s_token }}"
+        mode: "0600" # holds the cluster token, so must not be world-readable
+
+    - name: Add the token for joining the cluster to the environment
+      no_log: true # avoid logging the server token
+      ansible.builtin.lineinfile:
+        path: "/etc/systemd/system/{{ service_name }}.service.env"
+        line: "K3S_TOKEN={{ k3s_token }}"
+
+    - name: Add server url to agents
+      ansible.builtin.lineinfile:
+        path: "/etc/systemd/system/{{ service_name }}.service.env"
+        line: "K3S_URL=https://{{ k3s_server_name }}:6443"
+      when: k3s_server_name is defined
+
+    - name: Start k3s service
+      ansible.builtin.systemd:
+        name: "{{ service_name }}"
+        daemon_reload: true
+        state: started
+        enabled: true
diff --git a/ansible/roles/k3s/tasks/install.yml b/ansible/roles/k3s/tasks/install.yml
new file mode 100644
index 000000000..77b95a509
--- /dev/null
+++ b/ansible/roles/k3s/tasks/install.yml
@@ -0,0 +1,81 @@
+---
+# Installs the k3s server and agent services without enabling or starting them, plus Helm.
+# The service to run is chosen at boot by the ansible-init playbook copied in the last task.
+
+- name: Check for existing k3s installation
+  ansible.builtin.stat:
+    path: /var/lib/rancher/k3s
+  register: stat_result
+
+- name: Perform air-gapped installation of k3s
+  # Using air-gapped install so containers are pre-installed to avoid rate-limiting from registries on cluster startup
+  when: not stat_result.stat.exists
+  block:
+
+    - name: Download k3s binary
+      ansible.builtin.get_url:
+        url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s"
+        dest: /usr/bin/k3s
+        owner: root
+        group: root
+        mode: "0755"
+
+    - name: Install k3s SELinux policy package
+      ansible.builtin.yum:
+        name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ ansible_distribution_major_version }}.noarch.rpm"
+        disable_gpg_check: true
+
+    - name: Create image directory
+      ansible.builtin.file:
+        path: "/var/lib/rancher/k3s/agent/images"
+        state: directory
+
+    - name: Install k3s' internal images
+      ansible.builtin.get_url:
+        url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s-airgap-images-amd64.tar.zst"
+        dest: /var/lib/rancher/k3s/agent/images/k3s-airgap-images-amd64.tar.zst
+
+    - name: Download k3s install script
+      ansible.builtin.get_url:
+        url: https://get.k3s.io/
+        timeout: 120
+        dest: /usr/bin/k3s-install.sh
+        owner: root
+        group: root
+        mode: "0755"
+
+    - name: Install k3s
+      ansible.builtin.command:
+        cmd: /usr/bin/k3s-install.sh
+      environment:
+        INSTALL_K3S_VERSION: "{{ k3s_version }}"
+        INSTALL_K3S_EXEC: "{{ item }}"
+        INSTALL_K3S_SKIP_START: "true"
+        INSTALL_K3S_SKIP_ENABLE: "true"
+        INSTALL_K3S_BIN_DIR: "/usr/bin"
+        INSTALL_K3S_SKIP_DOWNLOAD: "true"
+      changed_when: true
+      # Run the installer once per mode so both the server and agent services are set up
+      loop:
+        - server --disable=traefik
+        - agent
+
+- name: Install helm
+  ansible.builtin.unarchive:
+    src: "https://get.helm.sh/helm-{{ k3s_helm_version }}-linux-amd64.tar.gz"
+    dest: /usr/bin
+    extra_opts: "--strip-components=1"
+    owner: root
+    group: root
+    mode: "0755"
+    remote_src: true
+
+- name: Add k3s kubeconfig as environment variable
+  ansible.builtin.lineinfile:
+    path: /etc/environment
+    line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"
+
+- name: Install ansible-init playbook for k3s agent or server activation
+  ansible.builtin.copy:
+    src: start_k3s.yml
+    dest: /etc/ansible-init/playbooks/0-start-k3s.yml
diff --git a/ansible/roles/k9s/tasks/main.yml b/ansible/roles/k9s/tasks/main.yml
new file mode 100644
index 000000000..674b4dffb
--- /dev/null
+++ b/ansible/roles/k9s/tasks/main.yml
@@ -0,0 +1,46 @@
+---
+# Installs k9s to /usr/bin/k9s unless a file already exists at that path.
+
+- name: Check if k9s is installed
+  ansible.builtin.stat:
+    path: "/usr/bin/k9s"
+  register: _k9s_stat_result
+
+- name: Install k9s and clean up temporary files
+  when: not _k9s_stat_result.stat.exists
+  block:
+    - name: Create install directory
+      ansible.builtin.file:
+        path: /tmp/k9s
+        state: directory
+        owner: root
+        group: root
+        mode: "0744"
+
+    - name: Download k9s
+      ansible.builtin.get_url:
+        url: https://github.com/derailed/k9s/releases/download/v0.32.5/k9s_Linux_amd64.tar.gz
+        dest: /tmp/k9s/k9s_Linux_amd64.tar.gz
+        owner: root
+        group: root
+        mode: "0744"
+
+    - name: Unpack k9s binary
+      ansible.builtin.unarchive:
+        src: /tmp/k9s/k9s_Linux_amd64.tar.gz
+        dest: /tmp/k9s
+        remote_src: true
+
+    - name: Add k9s to root path
+      ansible.builtin.copy:
+        src: /tmp/k9s/k9s
+        dest: /usr/bin/k9s
+        mode: "u+rwx"
+        remote_src: true
+
+  always:
+    # Runs even when a step above fails, so a partial download is never left in /tmp
+    - name: Cleanup k9s install directory
+      ansible.builtin.file:
+        path: /tmp/k9s
+        state: absent
diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml
index 55680ae37..d9a339efd 100644
--- a/ansible/roles/passwords/defaults/main.yml
+++ b/ansible/roles/passwords/defaults/main.yml
@@ -8,6 +8,7 @@ slurm_appliance_secrets:
   vault_openhpc_mungekey: "{{ secrets_openhpc_mungekey | default(vault_openhpc_mungekey | default(secrets_openhpc_mungekey_default)) }}"
   vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}"
   vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}"
+  vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
 
 secrets_openhpc_mungekey_default:
   content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}"
diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml
index 21b10f780..d67dc84ea 100644
--- a/ansible/roles/passwords/tasks/main.yml
+++ b/ansible/roles/passwords/tasks/main.yml
@@ -7,14 +7,15 @@
   delegate_to: localhost
   run_once: true
 
-# - name: Ensure munge key directory exists
-#   file:
-#     state: directory
-#     recurse: true
-#     path: "{{ openhpc_passwords_mungekey_output_path | dirname }}"
+- name: Get templated passwords from target environment
+  # inventory group/host vars created in a play cannot be accessed in the same play, even after meta: refresh_inventory
+  ansible.builtin.include_vars:
+    file: "{{ openhpc_passwords_output_path }}"
 
-# - name: Create a munge key
-#   copy:
-#     content: "{{ lookup('password', '/dev/null chars=ascii_letters,digits,hexdigits,punctuation') }}"
-#     dest: "{{ openhpc_passwords_mungekey_output_path }}"
-#     force: false
\ No newline at end of file
+- name: Template k3s token to terraform
+  ansible.builtin.template:
+    src: k3s-token.auto.tfvars.json.j2
+    dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/terraform/k3s-token.auto.tfvars.json"
+    mode: "0600" # rendered file contains the k3s token
+  delegate_to: localhost
+  run_once: true
diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2
new file mode 100644
index 000000000..2a8fabba8
--- /dev/null
+++ b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2
@@ -0,0 +1,3 @@
+{
+  "k3s_token": {{ vault_k3s_token | to_json }}
+}
diff --git 
a/docs/k3s.README.md b/docs/k3s.README.md
new file mode 100644
index 000000000..1b6651159
--- /dev/null
+++ b/docs/k3s.README.md
@@ -0,0 +1,8 @@
+# Overview
+A K3s cluster is deployed with the Slurm cluster. Both agent and server instances of K3s are installed during image build and the correct service (determined by OpenStack metadata) will be
+enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has
+access to the Kubernetes API. The `k3s` role installs Helm for package management. K9s is also installed in the image and can be used by the root user.
+
+# Idempotency
+K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`. Therefore, the `k3s` role isn't
+idempotent and changes to variables will not be reflected in the image when running `site.yml`.
diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml
index 05b0255c8..c8951a4af 100644
--- a/environments/.caas/hooks/pre.yml
+++ b/environments/.caas/hooks/pre.yml
@@ -1,5 +1,14 @@
 ---
 
+# Generate k3s token
+- name: Generate k3s token
+  # NB: Although this generates a new token on each run, the actual token set in metadata is retrieved from a set-once tofu resource, hence only the first value ever generated is relevant.
+  hosts: openstack
+  tasks:
+    - name: Set k3s_token fact to a random 64-character value
+      ansible.builtin.set_fact:
+        k3s_token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}"
+
 # Provision the infrastructure using Terraform
 - name: Provision infrastructure
   hosts: openstack
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 87f5c46cd..f9a2087c8 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241115-1209-097cdae1",
-        "RL9": "openhpc-RL9-241115-1209-097cdae1"
+        "RL8": "openhpc-RL8-241118-0918-4538c6df",
+        "RL9": "openhpc-RL9-241118-0918-4538c6df"
     }
 }
diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf
index ad13ae45d..4284ec132 100644
--- a/environments/.stackhpc/terraform/main.tf
+++ b/environments/.stackhpc/terraform/main.tf
@@ -54,6 +54,11 @@ variable "volume_backed_instances" {
     default = false
 }
 
+variable "k3s_token" {
+    description = "K3s cluster authentication token, set automatically by Ansible"
+    type = string
+}
+
 data "openstack_images_image_v2" "cluster" {
     name = var.cluster_image[var.os_version]
     most_recent = true
@@ -69,6 +74,7 @@ module "cluster" {
     key_pair = "slurm-app-ci"
     cluster_image_id = data.openstack_images_image_v2.cluster.id
     control_node_flavor = var.control_node_flavor
+    k3s_token = var.k3s_token
 
     login_nodes = {
         login-0: var.other_node_flavor
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index cd9a5cb0c..9b9aa5bf0 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -136,5 +136,11 @@ freeipa_client
 [ansible_init]
 # Hosts to run linux-anisble-init
 
+[k3s]
+# Hosts to run k3s server/agent
+
+[k9s]
+# Hosts to install k9s on
+
 [lustre]
 # Hosts to run lustre client
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index be0b3d1b7..ba5cbc08d 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -82,5 +82,13 @@ openhpc
 # Hosts to run ansible-init
 cluster
 
+[k3s:children]
+# Hosts to run k3s server/agent
+openhpc
+
+[k9s:children]
+# Hosts to install k9s on
+control
+
 [lustre]
 # Hosts to run lustre client
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf
index c8907c836..eb2139eba 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf
@@ -15,5 +15,7 @@ module "compute" {
   vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile)
   key_pair = var.key_pair
   environment_root = var.environment_root
+  k3s_token = var.k3s_token
+  k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
   security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id]
 }
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf
index 006f802c7..e64a2162c 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf
@@ -46,6 +46,8 @@ resource "openstack_compute_instance_v2" "compute" {
 
   metadata = {
     environment_root = var.environment_root
+    k3s_token = var.k3s_token
+    k3s_server = var.k3s_server
   }
 
   user_data = <<-EOF
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
index 5696a94eb..9d2c2e47c 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
@@ -67,3 +67,13 @@ variable "root_volume_size" {
 variable "security_group_ids" {
     type = list
 }
+
+variable "k3s_token" {
+    description = "K3s cluster authentication token"
+    type = string
+}
+
+variable "k3s_server" {
+    description = "Name/address of k3s server"
+    type = string
+}
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf
index 376d3da0e..bfbd1c532 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf
@@ -76,6 +76,7 @@ resource "openstack_compute_instance_v2" "control" {
 
   metadata = {
     environment_root = var.environment_root
+    k3s_token = var.k3s_token
   }
 
   user_data = <<-EOF
@@ -124,6 +125,8 @@ resource "openstack_compute_instance_v2" "login" {
 
   metadata = {
     environment_root = var.environment_root
+    k3s_token = var.k3s_token
+    k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
   }
 
   user_data = <<-EOF
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf
index 289de3fef..0f5eefa18 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf
@@ -131,3 +131,8 @@ variable "root_volume_size" {
     type = number
     default = 40
 }
+
+variable "k3s_token" {
+    description = "K3s cluster authentication token, set automatically by Ansible"
+    type = string
+}