From 0b69d4b643ec4f918f1ddb3ac842ed418a104f4d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 19 Feb 2025 09:28:00 +0000 Subject: [PATCH 01/14] now uses bootstrap tokens instead of cloud-init metadata --- ansible/bootstrap.yml | 2 +- ansible/roles/k3s/defaults/main.yml | 1 + ansible/roles/k3s/files/start_k3s.yml | 44 ------------- ansible/roles/k3s/tasks/install.yml | 5 -- ansible/roles/k3s/tasks/runtime.yml | 64 +++++++++++++++++++ .../k3s/templates/k3s-agent.service.env.j2 | 3 + .../roles/k3s/templates/k3s.service.env.j2 | 1 + ansible/roles/passwords/defaults/main.yml | 2 +- .../templates/k3s-token.auto.tfvars.json.j2 | 3 - environments/common/inventory/groups | 10 ++- environments/common/layouts/everything | 11 +++- .../tofu/compute.tf | 1 - .../tofu/control.tf | 1 - .../tofu/login.tf | 1 - .../tofu/node_group/nodes.tf | 2 - .../tofu/node_group/variables.tf | 4 -- .../tofu/variables.tf | 4 -- 17 files changed, 88 insertions(+), 71 deletions(-) delete mode 100644 ansible/roles/k3s/files/start_k3s.yml create mode 100644 ansible/roles/k3s/tasks/runtime.yml create mode 100644 ansible/roles/k3s/templates/k3s-agent.service.env.j2 create mode 100644 ansible/roles/k3s/templates/k3s.service.env.j2 delete mode 100644 ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 32aa51f76..f9d1fb887 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -312,4 +312,4 @@ tasks: - ansible.builtin.include_role: name: k3s - tasks_from: install.yml + tasks_from: "{{ 'install.yml' if 'builder' in group_names else 'runtime.yml' }}" diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index ba9a1a899..2700e1ee9 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -3,3 +3,4 @@ k3s_version: "v1.31.0+k3s1" k3s_selinux_release: v1.6.latest.1 k3s_selinux_rpm_version: 1.6-1 k3s_helm_version: v3.11.0 +k3s_bootstrap_token_expiry: 
5m diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml deleted file mode 100644 index 4ff72bcd8..000000000 --- a/ansible/roles/k3s/files/start_k3s.yml +++ /dev/null @@ -1,44 +0,0 @@ -- hosts: localhost - become: true - vars: - os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - k3s_token: "{{ os_metadata.meta.k3s_token }}" - k3s_server_name: "{{ os_metadata.meta.control_address }}" - service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" - access_ip: "{{ os_metadata.meta.access_ip }}" - tasks: - - name: Ensure password directory exists - ansible.builtin.file: - path: "/etc/rancher/node" - state: directory - - - name: Set agent node password as token # uses token to keep password consistent between reimages - ansible.builtin.copy: - dest: /etc/rancher/node/password - content: "{{ k3s_token }}" - - - name: Add the token for joining the cluster to the environment - no_log: true # avoid logging the server token - ansible.builtin.lineinfile: - path: "/etc/systemd/system/{{ service_name }}.service.env" - line: "K3S_TOKEN={{ k3s_token }}" - - - name: Add the node IP to the environment - # NB this isn't natively setable via envvars, have to modify - # INSTALL_K3S_EXEC to support it - ansible.builtin.lineinfile: - path: "/etc/systemd/system/{{ service_name }}.service.env" - line: "K3S_NODE_IP={{ access_ip }}" - - - name: Add server url to agents - ansible.builtin.lineinfile: - path: "/etc/systemd/system/{{ service_name }}.service.env" - line: "K3S_URL=https://{{ k3s_server_name }}:6443" - when: k3s_server_name is defined - - - name: Start k3s service - ansible.builtin.systemd: - name: "{{ service_name }}" - daemon_reload: true - state: started - enabled: true diff --git a/ansible/roles/k3s/tasks/install.yml b/ansible/roles/k3s/tasks/install.yml index 579a75bc4..c250f87a8 100644 --- a/ansible/roles/k3s/tasks/install.yml +++ b/ansible/roles/k3s/tasks/install.yml 
@@ -71,8 +71,3 @@ ansible.builtin.lineinfile: path: /etc/environment line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml" - -- name: Install ansible-init playbook for k3s agent or server activation - copy: - src: start_k3s.yml - dest: /etc/ansible-init/playbooks/0-start-k3s.yml diff --git a/ansible/roles/k3s/tasks/runtime.yml b/ansible/roles/k3s/tasks/runtime.yml new file mode 100644 index 000000000..4d88fabc0 --- /dev/null +++ b/ansible/roles/k3s/tasks/runtime.yml @@ -0,0 +1,64 @@ +--- +- name: Check if k3s agents are already connected + service_facts: + register: services_state + +- name: Initialise and authenticate k3s server and agents + vars: + k3s_server_name: "{{ hostvars[groups['k3s_server'].0].ansible_host }}" + access_ip: "{{ ansible_default_ipv4.address }}" + services_states: > # getting list of all unique agent service states + groups['k3s_agent'] + | map('extract', hostvars, ['services', 'k3s-agent.service', 'state']) + | unique + when: not (services_state | length == 1 and services_state[0] == 'running') + block: + - name: Initialise server and generate bootstrap tokens + when: inventory_hostname in groups['k3s_server'] + block: + - name: Template k3s env file + ansible.builtin.template: + dest: /etc/systemd/system/k3s.service.env + src: k3s.service.env.j2 + + - name: Start k3s server + ansible.builtin.systemd: + name: k3s + daemon_reload: true + state: started + enabled: true + + - name: Generate bootstrap token + no_log: true + shell: + cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" + register: _token_output + + - name: Initialise agents + when: inventory_hostname in groups['k3s_agent'] + block: + - name: Template k3s agent env file + ansible.builtin.template: + dest: /etc/systemd/system/k3s-agent.service.env + src: k3s-agent.service.env.j2 + + - name: Ensure password directory exists + ansible.builtin.file: + path: "/etc/rancher/node" + state: directory + + - name: Write node password + ansible.builtin.copy: + dest: 
/etc/rancher/node/password + content: "{{ vault_k3s_node_password }}" + owner: root + group: root + mode: 640 # normal k3s install is 644 but that doesn't feel right + + - name: Start k3s agent + ansible.builtin.systemd: + name: k3s-agent + daemon_reload: true + state: started + enabled: true + \ No newline at end of file diff --git a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 new file mode 100644 index 000000000..7064af4dd --- /dev/null +++ b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 @@ -0,0 +1,3 @@ +K3S_NODE_IP={{ access_ip }} +K3S_TOKEN={{ hostvars[groups['control'] | first]._token_output.stdout }} +K3S_URL=https://{{ k3s_server_name }}:6443 diff --git a/ansible/roles/k3s/templates/k3s.service.env.j2 b/ansible/roles/k3s/templates/k3s.service.env.j2 new file mode 100644 index 000000000..3ed552ea3 --- /dev/null +++ b/ansible/roles/k3s/templates/k3s.service.env.j2 @@ -0,0 +1 @@ +K3S_NODE_IP={{ access_ip }} diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 929aac465..0dbe66dd8 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -8,7 +8,7 @@ slurm_appliance_secrets: vault_openhpc_mungekey: "{{ secrets_openhpc_mungekey | default(vault_openhpc_mungekey | default(secrets_openhpc_mungekey_default)) }}" vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}" vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}" - vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" + vault_k3s_node_password: "{{ vault_k3s_node_password | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}" 
vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}" diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 deleted file mode 100644 index 2a8fabba8..000000000 --- a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 +++ /dev/null @@ -1,3 +0,0 @@ -{ - "k3s_token": "{{ vault_k3s_token }}" -} \ No newline at end of file diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ad9e1cd22..73c210bcf 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -145,8 +145,16 @@ freeipa_client [compute_init] # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on -[k3s] +[k3s:children] # Hosts to run k3s server/agent +k3s_server +k3s_agent + +[k3s_server] +# Hosts to run k3s server (should only be single node i.e control node) + +[k3s_agent] +# Hosts to run k3s agent [k9s] # Hosts to install k9s on diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index c5d9a0471..ca4c11dfb 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -97,9 +97,14 @@ cluster # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on compute -[k3s:children] -# Hosts to run k3s server/agent -openhpc +[k3s_server:children] +# Hosts to run k3s server (should only be single node i.e control node) +control + +[k3s_agent:children] +# Hosts to run k3s agent +compute +login [k9s:children] # Hosts to install k9s on diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf index fe614a101..555b9858d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf @@ -29,7 +29,6 @@ module "compute" { 
availability_zone = lookup(each.value, "availability_zone", "nova") # computed - k3s_token = local.k3s_token # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid # updates to node metadata on deletion/recreation of the control node: control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf index 0876ce00a..9c987c02f 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf @@ -59,7 +59,6 @@ resource "openstack_compute_instance_v2" "control" { metadata = { environment_root = var.environment_root - k3s_token = local.k3s_token access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf index d36c099b2..8ce8115a8 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf @@ -33,7 +33,6 @@ module "login" { ignore_image_changes = false # computed - k3s_token = local.k3s_token # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid # updates to node metadata on deletion/recreation of the control node: control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf index 305b89e62..d196928ab 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf @@ -85,7 +85,6 @@ 
resource "openstack_compute_instance_v2" "compute_fixed_image" { metadata = merge( { environment_root = var.environment_root - k3s_token = var.k3s_token control_address = var.control_address access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] }, @@ -139,7 +138,6 @@ resource "openstack_compute_instance_v2" "compute" { metadata = merge( { environment_root = var.environment_root - k3s_token = var.k3s_token control_address = var.control_address access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] }, diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf index 8a3f03876..05cbf286f 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf @@ -70,10 +70,6 @@ variable "security_group_ids" { type = list } -variable "k3s_token" { - type = string -} - variable "control_address" { description = "Name/address of control node" type = string diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index a6f398d09..503541648 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -189,7 +189,3 @@ variable "inventory_secrets_path" { type = string default = "" } - -locals { - k3s_token = data.external.inventory_secrets.result["vault_k3s_token"] -} From bfe9cafd79b34eff244f21e317ade207e00fca24 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 19 Feb 2025 09:36:40 +0000 Subject: [PATCH 02/14] bumped token timeout --- ansible/roles/k3s/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index 2700e1ee9..f992fb687 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -3,4 +3,4 @@ k3s_version: "v1.31.0+k3s1" k3s_selinux_release: v1.6.latest.1 k3s_selinux_rpm_version: 1.6-1 k3s_helm_version: v3.11.0 -k3s_bootstrap_token_expiry: 5m +k3s_bootstrap_token_expiry: 20m From 28b42054b434d0dab6d2221242cf084568aa21c4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 19 Feb 2025 10:00:46 +0000 Subject: [PATCH 03/14] fixed assuming default route --- ansible/roles/k3s/tasks/runtime.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/k3s/tasks/runtime.yml b/ansible/roles/k3s/tasks/runtime.yml index 4d88fabc0..c93c82984 100644 --- a/ansible/roles/k3s/tasks/runtime.yml +++ b/ansible/roles/k3s/tasks/runtime.yml @@ -6,7 +6,7 @@ - name: Initialise and authenticate k3s server and agents vars: k3s_server_name: "{{ hostvars[groups['k3s_server'].0].ansible_host }}" - access_ip: "{{ ansible_default_ipv4.address }}" + access_ip: "{{ ansible_host }}" services_states: > # getting list of all unique agent service states groups['k3s_agent'] | map('extract', hostvars, ['services', 'k3s-agent.service', 'state']) From aeeca4c3e55452f54b3e3dcd2aaf9004724315f1 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 19 Feb 2025 10:55:19 +0000 Subject: [PATCH 04/14] bump image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 99bca2f54..c73453323 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250211-1540-a0b4a57e", - "RL9": 
"openhpc-RL9-250211-1540-a0b4a57e" + "RL8": "openhpc-RL8-250219-1007-bfe9cafd", + "RL9": "openhpc-RL9-250219-1008-bfe9cafd" } } From e4ff694ea0ef618f0222881a5d6340c8a8a2479a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 21 Feb 2025 08:58:10 +0000 Subject: [PATCH 05/14] refactored to separate agent and server runtimes + fixes + review comments --- ansible/bootstrap.yml | 28 +++++++- ansible/roles/k3s/defaults/main.yml | 4 +- ansible/roles/k3s/tasks/agent-runtime.yml | 32 ++++++++++ ansible/roles/k3s/tasks/runtime.yml | 64 ------------------- ansible/roles/k3s/tasks/server-runtime.yml | 31 +++++++++ .../k3s/templates/k3s-agent.service.env.j2 | 4 +- .../roles/k3s/templates/k3s.service.env.j2 | 2 +- .../inventory/group_vars/all/defaults.yml | 1 + .../{{cookiecutter.environment}}/tofu/data.tf | 8 --- .../tofu/variables.tf | 6 -- 10 files changed, 96 insertions(+), 84 deletions(-) create mode 100644 ansible/roles/k3s/tasks/agent-runtime.yml delete mode 100644 ansible/roles/k3s/tasks/runtime.yml create mode 100644 ansible/roles/k3s/tasks/server-runtime.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index f9d1fb887..fdcd5e318 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -310,6 +310,30 @@ become: yes tags: k3s tasks: - - ansible.builtin.include_role: + - name: Install k3s + when: "'builder' in group_names" + ansible.builtin.include_role: + name: k3s + tasks_from: install.yml + +- hosts: k3s_server + become: yes + tags: k3s + tasks: + - name: Start k3s server + when: "'builder' not in group_names" + ansible.builtin.include_role: + name: k3s + tasks_from: server-runtime.yml + +- hosts: k3s_agent + become: yes + tags: k3s + tasks: + - name: Start k3s agents + when: "'builder' not in group_names" + vars: # set outside of role to allow compute init to define own value + k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first]._k3s_token_output.stdout | default('') }}" + ansible.builtin.include_role: name: k3s - tasks_from: "{{ 
'install.yml' if 'builder' in group_names else 'runtime.yml' }}" + tasks_from: agent-runtime.yml diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index f992fb687..b7c5e91c9 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -3,4 +3,6 @@ k3s_version: "v1.31.0+k3s1" k3s_selinux_release: v1.6.latest.1 k3s_selinux_rpm_version: 1.6-1 k3s_helm_version: v3.11.0 -k3s_bootstrap_token_expiry: 20m +k3s_bootstrap_token: "{{ None }}" # ansible managed +k3s_bootstrap_token_expiry: 10m +k3s_server_name: "{{ None }}" # ansible managed diff --git a/ansible/roles/k3s/tasks/agent-runtime.yml b/ansible/roles/k3s/tasks/agent-runtime.yml new file mode 100644 index 000000000..3a961e898 --- /dev/null +++ b/ansible/roles/k3s/tasks/agent-runtime.yml @@ -0,0 +1,32 @@ +--- + +- name: Template k3s agent env file + when: k3s_bootstrap_token != "" + ansible.builtin.template: + dest: /etc/systemd/system/k3s-agent.service.env + src: k3s-agent.service.env.j2 + register: _k3s_agent_token_result + +- name: Ensure password directory exists + ansible.builtin.file: + path: "/etc/rancher/node" + state: directory + owner: root + group: root + mode: 0640 + +- name: Write node password + ansible.builtin.copy: + dest: /etc/rancher/node/password + content: "{{ vault_k3s_node_password }}" + owner: root + group: root + mode: 0640 # normal k3s install is 644 but that doesn't feel right + +- name: Start/restart k3s agent + when: _k3s_agent_token_result.changed + ansible.builtin.systemd: + name: k3s-agent + daemon_reload: true + state: restarted + enabled: true diff --git a/ansible/roles/k3s/tasks/runtime.yml b/ansible/roles/k3s/tasks/runtime.yml deleted file mode 100644 index c93c82984..000000000 --- a/ansible/roles/k3s/tasks/runtime.yml +++ /dev/null @@ -1,64 +0,0 @@ ---- -- name: Check if k3s agents are already connected - service_facts: - register: services_state - -- name: Initialise and authenticate k3s server and agents - vars: 
- k3s_server_name: "{{ hostvars[groups['k3s_server'].0].ansible_host }}" - access_ip: "{{ ansible_host }}" - services_states: > # getting list of all unique agent service states - groups['k3s_agent'] - | map('extract', hostvars, ['services', 'k3s-agent.service', 'state']) - | unique - when: not (services_state | length == 1 and services_state[0] == 'running') - block: - - name: Initialise server and generate bootstrap tokens - when: inventory_hostname in groups['k3s_server'] - block: - - name: Template k3s env file - ansible.builtin.template: - dest: /etc/systemd/system/k3s.service.env - src: k3s.service.env.j2 - - - name: Start k3s server - ansible.builtin.systemd: - name: k3s - daemon_reload: true - state: started - enabled: true - - - name: Generate bootstrap token - no_log: true - shell: - cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" - register: _token_output - - - name: Initialise agents - when: inventory_hostname in groups['k3s_agent'] - block: - - name: Template k3s agent env file - ansible.builtin.template: - dest: /etc/systemd/system/k3s-agent.service.env - src: k3s-agent.service.env.j2 - - - name: Ensure password directory exists - ansible.builtin.file: - path: "/etc/rancher/node" - state: directory - - - name: Write node password - ansible.builtin.copy: - dest: /etc/rancher/node/password - content: "{{ vault_k3s_node_password }}" - owner: root - group: root - mode: 640 # normal k3s install is 644 but that doesn't feel right - - - name: Start k3s agent - ansible.builtin.systemd: - name: k3s-agent - daemon_reload: true - state: started - enabled: true - \ No newline at end of file diff --git a/ansible/roles/k3s/tasks/server-runtime.yml b/ansible/roles/k3s/tasks/server-runtime.yml new file mode 100644 index 000000000..fd5a3ae37 --- /dev/null +++ b/ansible/roles/k3s/tasks/server-runtime.yml @@ -0,0 +1,31 @@ +--- + +- name: Template k3s env file + ansible.builtin.template: + dest: /etc/systemd/system/k3s.service.env + src: k3s.service.env.j2 
+ register: _k3s_env_file_status + +- name: Start k3s server + ansible.builtin.systemd: + name: k3s + daemon_reload: "{{ _k3s_env_file_status.changed }}" + state: started + enabled: true + +# Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though +- name: Check if k3s agents are connected + ignore_errors: true + ansible.builtin.shell: + cmd: kubectl get nodes --no-headers | grep -w Ready + register: _k3s_connected_nodes + retries: 5 # there may be a delay before the server reconnects to itself + delay: 10 + until: not _k3s_connected_nodes.failed + +- name: Generate new bootstrap token + no_log: true + when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length + shell: + cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" + register: _k3s_token_output diff --git a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 index 7064af4dd..b994b0680 100644 --- a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 +++ b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 @@ -1,3 +1,3 @@ -K3S_NODE_IP={{ access_ip }} -K3S_TOKEN={{ hostvars[groups['control'] | first]._token_output.stdout }} +K3S_NODE_IP={{ ansible_host }} +K3S_TOKEN={{ k3s_bootstrap_token }} K3S_URL=https://{{ k3s_server_name }}:6443 diff --git a/ansible/roles/k3s/templates/k3s.service.env.j2 b/ansible/roles/k3s/templates/k3s.service.env.j2 index 3ed552ea3..746e6d809 100644 --- a/ansible/roles/k3s/templates/k3s.service.env.j2 +++ b/ansible/roles/k3s/templates/k3s.service.env.j2 @@ -1 +1 @@ -K3S_NODE_IP={{ access_ip }} +K3S_NODE_IP={{ ansible_host }} diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 23aafd73e..d96eed501 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ 
-21,6 +21,7 @@ opensearch_address: "127.0.0.1" prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}" openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}" grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}" +k3s_server_name: "{{ hostvars[groups['k3s_server'] | first].ansible_host }}" ############################# bootstrap: local user configuration ######################### diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf index 9c6ba76c3..443c52282 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf @@ -1,11 +1,3 @@ -data "external" "inventory_secrets" { - program = ["${path.module}/read-inventory-secrets.py"] - - query = { - path = var.inventory_secrets_path == "" ? "${path.module}/../inventory/group_vars/all/secrets.yml" : var.inventory_secrets_path - } -} - data "external" "baremetal_nodes" { # returns an empty map if cannot list baremetal nodes program = ["${path.module}/baremetal-node-list.py"] diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index 503541648..e3e5b154d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -183,9 +183,3 @@ variable "root_volume_size" { type = number default = 40 } - -variable "inventory_secrets_path" { - description = "Path to inventory secrets.yml file. Default is standard cookiecutter location." 
- type = string - default = "" -} From 178f853b1f7df132279ea08c6b935e47de431241 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 21 Feb 2025 10:12:32 +0000 Subject: [PATCH 06/14] removed inventory_secrets from ci --- environments/.stackhpc/tofu/main.tf | 3 --- 1 file changed, 3 deletions(-) diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index 4a27cbf7a..209db38f8 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -66,9 +66,6 @@ module "cluster" { key_pair = "slurm-app-ci" cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor - # have to override default, as unusually the actual module path and secrets - # are not in the same environment for stackhpc - inventory_secrets_path = "${path.module}/../inventory/group_vars/all/secrets.yml" login = { login: { From 3cc46b8813de1b8144173a0f7d9f6134fc0be657 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 21 Feb 2025 10:14:02 +0000 Subject: [PATCH 07/14] image bump --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index c73453323..0bd6001dc 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250219-1007-bfe9cafd", - "RL9": "openhpc-RL9-250219-1008-bfe9cafd" + "RL8": "openhpc-RL8-250221-0904-e4ff694e", + "RL9": "openhpc-RL9-250221-0904-e4ff694e" } } From 5e9541e3bd9cd45ca4b31492a064566c35d90303 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 25 Feb 2025 09:18:53 +0000 Subject: [PATCH 08/14] testing moving agent runtime after nfs --- ansible/bootstrap.yml | 22 ---------------------- ansible/extras.yml | 22 ++++++++++++++++++++++ 2 files 
changed, 22 insertions(+), 22 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index ca4ba0bec..681f631bb 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -322,25 +322,3 @@ ansible.builtin.include_role: name: k3s tasks_from: install.yml - -- hosts: k3s_server - become: yes - tags: k3s - tasks: - - name: Start k3s server - when: "'builder' not in group_names" - ansible.builtin.include_role: - name: k3s - tasks_from: server-runtime.yml - -- hosts: k3s_agent - become: yes - tags: k3s - tasks: - - name: Start k3s agents - when: "'builder' not in group_names" - vars: # set outside of role to allow compute init to define own value - k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first]._k3s_token_output.stdout | default('') }}" - ansible.builtin.include_role: - name: k3s - tasks_from: agent-runtime.yml diff --git a/ansible/extras.yml b/ansible/extras.yml index 72c76b3b1..a04dea2c2 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -1,3 +1,25 @@ +- hosts: k3s_server + become: yes + tags: k3s + tasks: + - name: Start k3s server + when: "'builder' not in group_names" + ansible.builtin.include_role: + name: k3s + tasks_from: server-runtime.yml + +- hosts: k3s_agent + become: yes + tags: k3s + tasks: + - name: Start k3s agents + when: "'builder' not in group_names" + vars: # set outside of role to allow compute init to define own value + k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first]._k3s_token_output.stdout | default('') }}" + ansible.builtin.include_role: + name: k3s + tasks_from: agent-runtime.yml + - hosts: basic_users:!builder become: yes tags: From 3508ebdc65db33c30bdd0e77e4bc51311877da27 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 25 Feb 2025 11:38:48 +0000 Subject: [PATCH 09/14] changed k3s conditionals to host patterns + server tweaks --- ansible/bootstrap.yml | 3 +-- ansible/extras.yml | 6 ++---- ansible/roles/k3s/tasks/server-runtime.yml | 3 +-- 3 files changed, 4 insertions(+), 
8 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 681f631bb..5b873fb31 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -313,12 +313,11 @@ - include_role: name: azimuth_cloud.image_utils.linux_ansible_init -- hosts: k3s +- hosts: k3s:&builder become: yes tags: k3s tasks: - name: Install k3s - when: "'builder' in group_names" ansible.builtin.include_role: name: k3s tasks_from: install.yml diff --git a/ansible/extras.yml b/ansible/extras.yml index a04dea2c2..f97497b3c 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -1,19 +1,17 @@ -- hosts: k3s_server +- hosts: k3s_server:!builder become: yes tags: k3s tasks: - name: Start k3s server - when: "'builder' not in group_names" ansible.builtin.include_role: name: k3s tasks_from: server-runtime.yml -- hosts: k3s_agent +- hosts: k3s_agent:!builder become: yes tags: k3s tasks: - name: Start k3s agents - when: "'builder' not in group_names" vars: # set outside of role to allow compute init to define own value k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first]._k3s_token_output.stdout | default('') }}" ansible.builtin.include_role: diff --git a/ansible/roles/k3s/tasks/server-runtime.yml b/ansible/roles/k3s/tasks/server-runtime.yml index fd5a3ae37..bdb65c17a 100644 --- a/ansible/roles/k3s/tasks/server-runtime.yml +++ b/ansible/roles/k3s/tasks/server-runtime.yml @@ -15,11 +15,10 @@ # Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though - name: Check if k3s agents are connected - ignore_errors: true ansible.builtin.shell: cmd: kubectl get nodes --no-headers | grep -w Ready register: _k3s_connected_nodes - retries: 5 # there may be a delay before the server reconnects to itself + retries: 6 # there may be a delay before the server reconnects to itself delay: 10 until: not _k3s_connected_nodes.failed From bd719747913ad66d1670cb50e349fdfa68c9fc88 Mon Sep 17 00:00:00 2001 From: 
wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 25 Feb 2025 16:26:37 +0000 Subject: [PATCH 10/14] comment suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/k3s/tasks/server-runtime.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/k3s/tasks/server-runtime.yml b/ansible/roles/k3s/tasks/server-runtime.yml index bdb65c17a..77eb5c980 100644 --- a/ansible/roles/k3s/tasks/server-runtime.yml +++ b/ansible/roles/k3s/tasks/server-runtime.yml @@ -14,15 +14,15 @@ enabled: true # Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though -- name: Check if k3s agents are connected +- name: Check which k3s agents are connected ansible.builtin.shell: cmd: kubectl get nodes --no-headers | grep -w Ready register: _k3s_connected_nodes - retries: 6 # there may be a delay before the server reconnects to itself + retries: 6 # task may fail if server is not ready yet delay: 10 until: not _k3s_connected_nodes.failed -- name: Generate new bootstrap token +- name: Generate new bootstrap token if not all agents are connected no_log: true when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length shell: From 19da5d5901df6bb29f34fbc0c5452d0bb9bfdde4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 26 Feb 2025 09:27:46 +0000 Subject: [PATCH 11/14] k3s agent no longer uses task vars --- ansible/extras.yml | 2 -- ansible/roles/k3s/tasks/server-runtime.yml | 4 ++++ environments/common/inventory/group_vars/all/k3s.yml | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/k3s.yml diff --git a/ansible/extras.yml b/ansible/extras.yml index f97497b3c..7be436c39 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -12,8 +12,6 @@ tags: k3s tasks: - name: Start k3s agents - vars: # set outside of role to allow 
compute init to define own value - k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first]._k3s_token_output.stdout | default('') }}" ansible.builtin.include_role: name: k3s tasks_from: agent-runtime.yml diff --git a/ansible/roles/k3s/tasks/server-runtime.yml b/ansible/roles/k3s/tasks/server-runtime.yml index 77eb5c980..3f9636d94 100644 --- a/ansible/roles/k3s/tasks/server-runtime.yml +++ b/ansible/roles/k3s/tasks/server-runtime.yml @@ -28,3 +28,7 @@ shell: cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" register: _k3s_token_output + +- name: Set bootstrap token as fact + set_fact: + k3s_bootstrap_token: "{{ _k3s_token_output.stdout | default('') }}" diff --git a/environments/common/inventory/group_vars/all/k3s.yml b/environments/common/inventory/group_vars/all/k3s.yml new file mode 100644 index 000000000..a7ba0a0bf --- /dev/null +++ b/environments/common/inventory/group_vars/all/k3s.yml @@ -0,0 +1 @@ +k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}" From 33b84bcccd1b0721c70acc6b1027386a2c77e593 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 26 Feb 2025 13:55:44 +0000 Subject: [PATCH 12/14] added comment for moving k3s --- ansible/extras.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ansible/extras.yml b/ansible/extras.yml index 7be436c39..c7cacb877 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -7,6 +7,8 @@ name: k3s tasks_from: server-runtime.yml +# technically this should be part of bootstrap.yml, but it hangs waiting on failed mounts +# if it runs before filesystems.yml after the control node has been reimaged - hosts: k3s_agent:!builder become: yes tags: k3s From 051ca04595b2e81e50f15f73ef661096806e1704 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 27 Feb 2025 14:54:07 +0000 Subject: [PATCH 13/14] Explicitly setting empty string default for bootstrap token Co-authored-by: Steve Brasier
<33413598+sjpb@users.noreply.github.com> --- ansible/roles/k3s/defaults/main.yml | 2 +- ansible/roles/k3s/tasks/agent-runtime.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index b7c5e91c9..984c63df9 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -3,6 +3,6 @@ k3s_version: "v1.31.0+k3s1" k3s_selinux_release: v1.6.latest.1 k3s_selinux_rpm_version: 1.6-1 k3s_helm_version: v3.11.0 -k3s_bootstrap_token: "{{ None }}" # ansible managed +k3s_bootstrap_token: '' # matches common environment default k3s_bootstrap_token_expiry: 10m k3s_server_name: "{{ None }}" # ansible managed diff --git a/ansible/roles/k3s/tasks/agent-runtime.yml b/ansible/roles/k3s/tasks/agent-runtime.yml index 3a961e898..b31d5ebf5 100644 --- a/ansible/roles/k3s/tasks/agent-runtime.yml +++ b/ansible/roles/k3s/tasks/agent-runtime.yml @@ -1,7 +1,7 @@ --- - name: Template k3s agent env file - when: k3s_bootstrap_token != "" + when: k3s_bootstrap_token != '' ansible.builtin.template: dest: /etc/systemd/system/k3s-agent.service.env src: k3s-agent.service.env.j2 From 5fb1d3382febca1c55fd4342e7e8a9cdccb468ab Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 27 Feb 2025 14:57:02 +0000 Subject: [PATCH 14/14] secure perms for agent environment file --- ansible/roles/k3s/tasks/agent-runtime.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ansible/roles/k3s/tasks/agent-runtime.yml b/ansible/roles/k3s/tasks/agent-runtime.yml index b31d5ebf5..8377817ce 100644 --- a/ansible/roles/k3s/tasks/agent-runtime.yml +++ b/ansible/roles/k3s/tasks/agent-runtime.yml @@ -5,6 +5,9 @@ ansible.builtin.template: dest: /etc/systemd/system/k3s-agent.service.env src: k3s-agent.service.env.j2 + owner: root + group: root + mode: "0640" register: _k3s_agent_token_result - name: Ensure password directory exists