diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index 5a7137c5e..5b873fb31 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -313,10 +313,11 @@
     - include_role:
         name: azimuth_cloud.image_utils.linux_ansible_init
 
-- hosts: k3s
+- hosts: k3s:&builder
   become: yes
   tags: k3s
   tasks:
-    - ansible.builtin.include_role:
+    - name: Install k3s
+      ansible.builtin.include_role:
         name: k3s
         tasks_from: install.yml
diff --git a/ansible/extras.yml b/ansible/extras.yml
index 72c76b3b1..c7cacb877 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -1,3 +1,23 @@
+- hosts: k3s_server:!builder
+  become: yes
+  tags: k3s
+  tasks:
+    - name: Start k3s server
+      ansible.builtin.include_role:
+        name: k3s
+        tasks_from: server-runtime.yml
+
+# Technically this belongs in bootstrap.yml, but it hangs waiting on failed mounts
+# if it runs before filesystems.yml after the control node has been reimaged.
+- hosts: k3s_agent:!builder
+  become: yes
+  tags: k3s
+  tasks:
+    - name: Start k3s agents
+      ansible.builtin.include_role:
+        name: k3s
+        tasks_from: agent-runtime.yml
+
 - hosts: basic_users:!builder
   become: yes
   tags:
diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml
index ba9a1a899..984c63df9 100644
--- a/ansible/roles/k3s/defaults/main.yml
+++ b/ansible/roles/k3s/defaults/main.yml
@@ -3,3 +3,6 @@ k3s_version: "v1.31.0+k3s1"
 k3s_selinux_release: v1.6.latest.1
 k3s_selinux_rpm_version: 1.6-1
 k3s_helm_version: v3.11.0
+k3s_bootstrap_token: '' # matches common environment default
+k3s_bootstrap_token_expiry: 10m
+k3s_server_name: "{{ None }}" # ansible managed
diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml
deleted file mode 100644
index 4ff72bcd8..000000000
--- a/ansible/roles/k3s/files/start_k3s.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-- hosts: localhost
-  become: true
-  vars:
-    os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
-    k3s_token: "{{ os_metadata.meta.k3s_token }}"
-    k3s_server_name: "{{ os_metadata.meta.control_address }}"
-    service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}"
-    access_ip: "{{ os_metadata.meta.access_ip }}"
-  tasks:
-    - name: Ensure password directory exists
-      ansible.builtin.file:
-        path: "/etc/rancher/node"
-        state: directory
-
-    - name: Set agent node password as token # uses token to keep password consistent between reimages
-      ansible.builtin.copy:
-        dest: /etc/rancher/node/password
-        content: "{{ k3s_token }}"
-
-    - name: Add the token for joining the cluster to the environment
-      no_log: true # avoid logging the server token
-      ansible.builtin.lineinfile:
-        path: "/etc/systemd/system/{{ service_name }}.service.env"
-        line: "K3S_TOKEN={{ k3s_token }}"
-
-    - name: Add the node IP to the environment
-      # NB this isn't natively setable via envvars, have to modify
-      # INSTALL_K3S_EXEC to support it
-      ansible.builtin.lineinfile:
-        path: "/etc/systemd/system/{{ service_name }}.service.env"
-        line: "K3S_NODE_IP={{ access_ip }}"
-
-    - name: Add server url to agents
-      ansible.builtin.lineinfile:
-        path: "/etc/systemd/system/{{ service_name }}.service.env"
-        line: "K3S_URL=https://{{ k3s_server_name }}:6443"
-      when: k3s_server_name is defined
-
-    - name: Start k3s service
-      ansible.builtin.systemd:
-        name: "{{ service_name }}"
-        daemon_reload: true
-        state: started
-        enabled: true
diff --git a/ansible/roles/k3s/tasks/agent-runtime.yml b/ansible/roles/k3s/tasks/agent-runtime.yml
new file mode 100644
index 000000000..8377817ce
--- /dev/null
+++ b/ansible/roles/k3s/tasks/agent-runtime.yml
@@ -0,0 +1,37 @@
+---
+
+- name: Template k3s agent env file
+  when: k3s_bootstrap_token != ''
+  no_log: true # rendered env file contains the cluster join token
+  ansible.builtin.template:
+    dest: /etc/systemd/system/k3s-agent.service.env
+    src: k3s-agent.service.env.j2
+    owner: root
+    group: root
+    mode: "0640"
+  register: _k3s_agent_token_result
+
+- name: Ensure password directory exists
+  ansible.builtin.file:
+    path: "/etc/rancher/node"
+    state: directory
+    owner: root
+    group: root
+    mode: "0750" # directories need the execute bit to be traversable
+
+- name: Write node password
+  no_log: true # avoid logging the node password
+  ansible.builtin.copy:
+    dest: /etc/rancher/node/password
+    content: "{{ vault_k3s_node_password }}"
+    owner: root
+    group: root
+    mode: "0640" # normal k3s install is 644 but that doesn't feel right
+
+- name: Start/restart k3s agent
+  when: _k3s_agent_token_result.changed
+  ansible.builtin.systemd:
+    name: k3s-agent
+    daemon_reload: true
+    state: restarted
+    enabled: true
diff --git a/ansible/roles/k3s/tasks/install.yml b/ansible/roles/k3s/tasks/install.yml
index 579a75bc4..c250f87a8 100644
--- a/ansible/roles/k3s/tasks/install.yml
+++ b/ansible/roles/k3s/tasks/install.yml
@@ -71,8 +71,3 @@
   ansible.builtin.lineinfile:
     path: /etc/environment
     line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"
-
-- name: Install ansible-init playbook for k3s agent or server activation
-  copy:
-    src: start_k3s.yml
-    dest: /etc/ansible-init/playbooks/0-start-k3s.yml
diff --git a/ansible/roles/k3s/tasks/server-runtime.yml b/ansible/roles/k3s/tasks/server-runtime.yml
new file mode 100644
index 000000000..3f9636d94
--- /dev/null
+++ b/ansible/roles/k3s/tasks/server-runtime.yml
@@ -0,0 +1,37 @@
+---
+
+- name: Template k3s env file
+  ansible.builtin.template:
+    dest: /etc/systemd/system/k3s.service.env
+    src: k3s.service.env.j2
+  register: _k3s_env_file_status
+
+- name: Start k3s server
+  ansible.builtin.systemd:
+    name: k3s
+    daemon_reload: "{{ _k3s_env_file_status.changed }}"
+    state: started
+    enabled: true
+
+# Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though
+- name: Check which k3s agents are connected
+  ansible.builtin.shell:
+    cmd: kubectl get nodes --no-headers | grep -w Ready
+  register: _k3s_connected_nodes
+  changed_when: false # read-only query, never modifies the host
+  retries: 6 # task may fail if server is not ready yet
+  delay: 10
+  until: not _k3s_connected_nodes.failed
+
+- name: Generate new bootstrap token if not all agents are connected
+  no_log: true
+  when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length
+  ansible.builtin.shell:
+    cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}"
+  register: _k3s_token_output
+
+- name: Set bootstrap token as fact
+  ansible.builtin.set_fact:
+    # stdout is absent when token generation was skipped (all agents connected);
+    # fall back to the empty token so agents leave their env file untouched
+    k3s_bootstrap_token: "{{ _k3s_token_output.stdout | default('') }}"
diff --git a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 b/ansible/roles/k3s/templates/k3s-agent.service.env.j2
new file mode 100644
index 000000000..b994b0680
--- /dev/null
+++ b/ansible/roles/k3s/templates/k3s-agent.service.env.j2
@@ -0,0 +1,3 @@
+K3S_NODE_IP={{ ansible_host }}
+K3S_TOKEN={{ k3s_bootstrap_token }}
+K3S_URL=https://{{ k3s_server_name }}:6443
diff --git a/ansible/roles/k3s/templates/k3s.service.env.j2 b/ansible/roles/k3s/templates/k3s.service.env.j2
new file mode 100644
index 000000000..746e6d809
--- /dev/null
+++ b/ansible/roles/k3s/templates/k3s.service.env.j2
@@ -0,0 +1 @@
+K3S_NODE_IP={{ ansible_host }}
diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml
index 929aac465..0dbe66dd8 100644
--- a/ansible/roles/passwords/defaults/main.yml
+++ b/ansible/roles/passwords/defaults/main.yml
@@ -8,7 +8,7 @@ slurm_appliance_secrets:
   vault_openhpc_mungekey: "{{ secrets_openhpc_mungekey | default(vault_openhpc_mungekey | default(secrets_openhpc_mungekey_default)) }}"
   vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}"
   vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}"
-  vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
+  vault_k3s_node_password: "{{ vault_k3s_node_password | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
   vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}"
   vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}"
 
diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2
deleted file mode 100644
index 2a8fabba8..000000000
--- a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "k3s_token": "{{ vault_k3s_token }}"
-}
\ No newline at end of file
diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
index 99bca2f54..0bd6001dc 100644
--- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-250211-1540-a0b4a57e",
-        "RL9": "openhpc-RL9-250211-1540-a0b4a57e"
+        "RL8": "openhpc-RL8-250221-0904-e4ff694e",
+        "RL9": "openhpc-RL9-250221-0904-e4ff694e"
     }
 }
diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf
index d53c30788..46c7badd0 100644
--- a/environments/.stackhpc/tofu/main.tf
+++ b/environments/.stackhpc/tofu/main.tf
@@ -66,9 +66,6 @@ module "cluster" {
     key_pair = "slurm-app-ci"
     cluster_image_id = data.openstack_images_image_v2.cluster.id
     control_node_flavor = var.control_node_flavor
-    # have to override default, as unusually the actual module path and secrets
-    # are not in the same environment for stackhpc
-    inventory_secrets_path = "${path.module}/../inventory/group_vars/all/secrets.yml"
 
     login = {
         login: {
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 23aafd73e..d96eed501 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -21,6 +21,7 @@ opensearch_address: "127.0.0.1"
 prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
 openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}"
 grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}"
+k3s_server_name: "{{ hostvars[groups['k3s_server'] | first].ansible_host }}"
 
 ############################# bootstrap: local user configuration #########################
 
diff --git a/environments/common/inventory/group_vars/all/k3s.yml b/environments/common/inventory/group_vars/all/k3s.yml
new file mode 100644
index 000000000..a7ba0a0bf
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/k3s.yml
@@ -0,0 +1 @@
+k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}"
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 7fd73511d..632e1f25b 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -145,8 +145,16 @@ freeipa_client
 [compute_init]
 # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
 
-[k3s]
+[k3s:children]
 # Hosts to run k3s server/agent
+k3s_server
+k3s_agent
+
+[k3s_server]
+# Hosts to run k3s server (should only be single node i.e control node)
+
+[k3s_agent]
+# Hosts to run k3s agent
 
 [k9s]
 # Hosts to install k9s on
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index 5b325f108..ab5e1be5c 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -96,9 +96,14 @@ cluster
 [compute_init]
 # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
 
-[k3s:children]
-# Hosts to run k3s server/agent
-openhpc
+[k3s_server:children]
+# Hosts to run k3s server (should only be single node i.e control node)
+control
+
+[k3s_agent:children]
+# Hosts to run k3s agent
+compute
+login
 
 [k9s:children]
 # Hosts to install k9s on
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf
index fe614a101..555b9858d 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf
@@ -29,7 +29,6 @@ module "compute" {
   availability_zone = lookup(each.value, "availability_zone", "nova")
 
   # computed
-  k3s_token = local.k3s_token
   # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid
   # updates to node metadata on deletion/recreation of the control node:
   control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf
index a8a684d31..26597b1ac 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf
@@ -60,7 +60,6 @@ resource "openstack_compute_instance_v2" "control" {
 
   metadata = {
     environment_root = var.environment_root
-    k3s_token = local.k3s_token
     access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
   }
 
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf
index 9c6ba76c3..443c52282 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf
@@ -1,11 +1,3 @@
-data "external" "inventory_secrets" {
-  program = ["${path.module}/read-inventory-secrets.py"]
-
-  query = {
-    path = var.inventory_secrets_path == "" ? "${path.module}/../inventory/group_vars/all/secrets.yml" : var.inventory_secrets_path
-  }
-}
-
 data "external" "baremetal_nodes" {
   # returns an empty map if cannot list baremetal nodes
   program = ["${path.module}/baremetal-node-list.py"]
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf
index d36c099b2..8ce8115a8 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf
@@ -33,7 +33,6 @@ module "login" {
   ignore_image_changes = false
 
   # computed
-  k3s_token = local.k3s_token
   # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid
   # updates to node metadata on deletion/recreation of the control node:
   control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf
index a6e850136..186b8538b 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf
@@ -86,7 +86,6 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" {
   metadata = merge(
     {
         environment_root = var.environment_root
-        k3s_token = var.k3s_token
         control_address = var.control_address
         access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0]
     },
@@ -140,7 +139,6 @@ resource "openstack_compute_instance_v2" "compute" {
   metadata = merge(
     {
         environment_root = var.environment_root
-        k3s_token = var.k3s_token
         control_address = var.control_address
         access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0]
     },
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf
index 8a3f03876..05cbf286f 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf
@@ -70,10 +70,6 @@ variable "security_group_ids" {
     type = list
 }
 
-variable "k3s_token" {
-    type = string
-}
-
 variable "control_address" {
     description = "Name/address of control node"
     type = string
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf
index 11b5b771d..8d5808ba7 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf
@@ -184,13 +184,3 @@ variable "root_volume_size" {
     type = number
     default = 40
 }
-
-variable "inventory_secrets_path" {
-  description = "Path to inventory secrets.yml file. Default is standard cookiecutter location."
-  type = string
-  default = ""
-}
-
-locals {
-  k3s_token = data.external.inventory_secrets.result["vault_k3s_token"]
-}