From 0dcf774b65106fe125f6bdebafee08bc3e11e002 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 5 Mar 2025 15:19:59 +0000 Subject: [PATCH 01/15] FIX: Tofu attempts to apply security groups when port_security_enabled is false (#601) * fix security_group_id logic * toggle secgroups without touching port security * document no_security_groups flag --- .../skeleton/{{cookiecutter.environment}}/tofu/control.tf | 4 ++-- .../{{cookiecutter.environment}}/tofu/node_group/nodes.tf | 6 +++--- .../skeleton/{{cookiecutter.environment}}/tofu/variables.tf | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf index 6e52a3aed..dc1c05b3b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf @@ -14,8 +14,8 @@ resource "openstack_networking_port_v2" "control" { subnet_id = data.openstack_networking_subnet_v2.cluster_subnet[each.key].id } - port_security_enabled = lookup(each.value, "port_security_enabled", null) - security_group_ids = lookup(each.value, "port_security_enabled", null) != false ? [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] : [] + no_security_groups = lookup(each.value, "no_security_groups", false) + security_group_ids = lookup(each.value, "no_security_groups", false) ? 
[] : [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] binding { vnic_type = lookup(var.vnic_types, each.key, "normal") diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf index 426689bb9..f5d3424e6 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf @@ -44,9 +44,9 @@ resource "openstack_networking_port_v2" "compute" { fixed_ip { subnet_id = data.openstack_networking_subnet_v2.subnet[each.value.network].id } - - port_security_enabled = lookup(each.value, "port_security_enabled", null) - security_group_ids = lookup(each.value, "port_security_enabled", null) != false ? var.security_group_ids : [] + + no_security_groups = lookup(each.value, "no_security_groups", false) + security_group_ids = lookup(each.value, "no_security_groups", false) ? [] : var.security_group_ids binding { vnic_type = lookup(var.vnic_types, each.value.network, "normal") diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index bbcef8734..73c872feb 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -15,7 +15,7 @@ variable "cluster_networks" { List of mappings defining networks. Mapping key/values: network: Required. Name of existing network subnet: Required. Name of existing subnet - port_security_enabled: Optional. Bool, default null (for networks not owned by project) + no_security_groups: Optional. Bool (default: false). 
Disable security groups EOT } From 0e2ec5277d0bf6fd60eff735cd6475d8aeb0a15c Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 5 Mar 2025 15:31:31 +0000 Subject: [PATCH 02/15] Add file deletion to cleanup play (#600) * add file deletion to cleanup play * bump CI image * add bacin deleted OOD file and fix paths in /etc * bump CI image --- ansible/cleanup.yml | 16 +++++++++++++++- environments/.stackhpc/hooks/post.yml | 14 -------------- .../tofu/cluster_image.auto.tfvars.json | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) delete mode 100644 environments/.stackhpc/hooks/post.yml diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 670a99b29..744f9b657 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -38,7 +38,21 @@ - name: Cleanup /tmp command : rm -rf /tmp/* - + +- name: Delete files triggering vulnerability scans + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: # NB: items here MUST have a justification! 
+ # ondemand install: raised at https://github.com/OSC/ondemand/security/advisories/GHSA-f7j8-ppqm-m5vw + # All declared not to be an issue by Open Ondemand as relevant packages not installed + - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" + - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" + - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock + # chrony role: only used for role dev, venv never created on disk + - /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/poetry.lock + - /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/requirements.txt + - name: Get package facts package_facts: diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml deleted file mode 100644 index 9d506d725..000000000 --- a/environments/.stackhpc/hooks/post.yml +++ /dev/null @@ -1,14 +0,0 @@ -- hosts: openondemand - become: yes - gather_facts: false - tasks: - - name: Delete ondemand files causing Trivy scan false-positives - # Raised at https://github.com/OSC/ondemand/security/advisories/GHSA-f7j8-ppqm-m5vw - # All declared not to be an issue by Open Ondemand as relevant packages not installed - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: - - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" - - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" - - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 0bd6001dc..3e8293206 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ 
b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250221-0904-e4ff694e", - "RL9": "openhpc-RL9-250221-0904-e4ff694e" + "RL8": "openhpc-RL8-250305-1110-534ed276", + "RL9": "openhpc-RL9-250305-1110-534ed276" } } From 651c7222f5964a35b2d2a7f572231c410372a51c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 5 Mar 2025 12:06:04 +0000 Subject: [PATCH 03/15] define desired behaviour --- docs/networks.md | 94 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 9 deletions(-) diff --git a/docs/networks.md b/docs/networks.md index 4556ac623..73e603675 100644 --- a/docs/networks.md +++ b/docs/networks.md @@ -8,11 +8,13 @@ subnets or associated infrastructure such as routers. The requirements are that: 4. At least one network on each node provides outbound internet access (either directly, or via a proxy). -Futhermore, it is recommended that the deploy host has an interface on the -access network. While it is possible to e.g. use a floating IP on a login node -as an SSH proxy to access the other nodes, this can create problems in recovering -the cluster if the login node is unavailable and can make Ansible problems harder -to debug. +Addresses on the "access network" are used for `ansible_host` and `k3s` node IPs. + +It is recommended that the deploy host either has a direct connection to the +"access network" or jumps through a host on it which is not part of the appliance. +Using e.g. a floating IP on a login node as a jumphost creates problems in +recovering the cluster if the login node is unavailable and can make Ansible +problems harder to debug. > [!WARNING] > If home directories are on a shared filesystem with no authentication (such @@ -29,8 +31,8 @@ the OpenTofu variables. These will normally be set in need to be overriden for specific environments, this can be done via an OpenTofu module as discussed [here](./production.md). 
-Note that if an OpenStack subnet has a gateway IP defined then nodes with ports -attached to that subnet will get a default route set via that gateway. +Note that if an OpenStack subnet has a gateway IP defined then by default nodes +with ports attached to that subnet get a default route set via that gateway. ## Single network This is the simplest possible configuration. A single network and subnet is @@ -77,8 +79,9 @@ vnic_types = { ## Additional networks on some nodes This example shows how to modify variables for specific node groups. In this -case a baremetal node group has a second network attached. As above, only a -single subnet can have a gateway IP. +case a baremetal node group has a second network attached. Here "subnetA" must +have a gateway IP defined and "subnetB" must not, to avoid routing problems on +the multi-homed compute nodes. ```terraform cluster_networks = [ @@ -109,3 +112,76 @@ compute = { } ... ``` + +## Multiple networks with non-default gateways + +In some multiple network configurations it may be necessary to manage default +routes rather than them being automatically created from a subnet gateway. +This can be done using the tofu variable `gateway_ip` which can be set for the +cluster and/or overridden on the compute and login groups. If this is set: +- a default route via that address will be created on the appropriate interface + during boot if it does not exist +- any other default routes will be removed + +For example the cluster configuration below has a "campus" network with a +default gateway which provides inbound SSH / ondemand access and outbound +internet attached only to the login nodes, and a "data" network attached to +all nodes. 
The "data" network has no gateway IP set on its subnet to avoid dual +default routes and routing conflicts on the multi-homed login nodes, but does +have outbound connectivity via a router: + +```terraform +cluster_networks = [ + { + network = "data" # access network, CIDR 172.16.0.0/23 + subnet = "data_subnet" + } +] + +login = { + interactive = { + nodes = ["login-0"] + extra_networks = [ + { + network = "campus" + subnet = "campus_subnet" + } + ] + } +} +compute = { + general = { + nodes = ["compute-0", "compute-1"] + gateway_ip = "172.16.0.1" # Router interface + } +} +``` + +If there is no default route at all (either from a subnet gateway or from +`gateway_ip`) then a dummy route is created via the access network interface to +ensure [correct](https://docs.k3s.io/installation/airgap#default-network-route) +`k3s` operation. + +## Proxies + +If some nodes have no outbound connectivity via any networks, the cluster can +be configured to deploy a [squid proxy](https://www.squid-cache.org/) on a node +with outbound connectivity. 
Assuming the `compute` and `control` nodes have no +outbound connectivity and the `login` node does, the minimal configuration for +this is: + +```yaml +# environments/$SITE/inventory/groups: +[squid:children] +login +[proxy:children] +control +compute +``` + +```yaml +# environments/$SITE/inventory/group_vars/all/squid.yml: +# these are just examples +squid_cache_disk: 1024 # MB +squid_cache_mem: '12 GB' +``` From a83eb184e18ae53afeacfb4ae87183128ed51927 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 5 Mar 2025 15:51:15 +0000 Subject: [PATCH 04/15] add gateway ansible-init role --- ansible/.gitignore | 2 + ansible/fatimage.yml | 11 ++- ansible/roles/gateway/files/gateway-init.yml | 93 ++++++++++++++++++++ ansible/roles/gateway/tasks/main.yml | 7 ++ environments/common/inventory/groups | 3 + environments/common/layouts/everything | 4 + 6 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 ansible/roles/gateway/files/gateway-init.yml create mode 100644 ansible/roles/gateway/tasks/main.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index af10c417a..8e0b7c935 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -86,3 +86,5 @@ roles/* !roles/pytools/** !roles/rebuild/ !roles/rebuild/** +!roles/gateway/ +!roles/gateway/** diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 3d21d109f..2765641e3 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -79,7 +79,7 @@ - import_playbook: extras.yml # TODO: is this the right place? 
-- name: Install compute_init script +- name: Install compute_init playbook hosts: compute_init tags: compute_init # tagged to allow running on cluster instances for dev become: yes @@ -88,6 +88,15 @@ name: compute_init tasks_from: install.yml +- name: Install gateway playbook + hosts: gateway + tags: compute_init + become: yes + gather_facts: no + tasks: + - include_role: + name: gateway + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/roles/gateway/files/gateway-init.yml b/ansible/roles/gateway/files/gateway-init.yml new file mode 100644 index 000000000..08301cbd3 --- /dev/null +++ b/ansible/roles/gateway/files/gateway-init.yml @@ -0,0 +1,93 @@ +- hosts: localhost + #become: true + gather_facts: false + vars: + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + #gateway_ip: "{{ os_metadata.meta.gateway_ip | default('') }}" + access_ip: "{{ os_metadata.meta.access_ip | default('') }}" + gateway_ip: 172.16.0.1 # DEBUG - actual + # gateway_ip: 192.168.9.1 + #gateway_ip: 10.20.0.0 + gateway_ip: '' + tasks: + - name: Read nmcli device info + command: nmcli --get GENERAL.DEVICE,GENERAL.CONNECTION,IP4.ADDRESS,IP4.GATEWAY device show + register: _nmcli_device_raw + changed_when: false + + - name: Set fact for nmcli devices + set_fact: + # creates a dict with keys as per zip arg below, values might be '' + nmcli_devices: >- + {{ + _nmcli_device_raw.stdout_lines | + batch(5, '') | + map('zip', ['device', 'connection', 'ip4_address', 'ip4_gateway']) | + map('map', 'reverse') | map('community.general.dict') + }} + # batch=5 because per device have 4x lines + blank line between devices + # batch takes default '' because last devices doesn't have trailing blank line + + - name: Examine whether device address contains gateway_ip + set_fact: + device_is_gateway_device: "{{ nmcli_devices | map(attribute='ip4_address') | map('ansible.utils.network_in_network', gateway_ip) }}" + + - name: Get name of 
connection containing gateway_ip + # might be empty string + set_fact: + gateway_ip_connection: >- + {{ nmcli_devices | map(attribute='connection') | + zip(device_is_gateway_device) | selectattr('1') | + map(attribute=0) | list | first | default ('') }} + + - name: Error if device has a gateway which is not the desired one + # TODO: document + assert: + that: item.ip4_gateway == gateway_ip # always false when the 'when' below matches, so fail_msg is shown + fail_msg: "Device {{ item | to_nice_json }} has gateway: cannot apply gateway {{ gateway_ip }}" + when: + - item.connection == gateway_ip_connection + - item.ip4_gateway != '' + - item.ip4_gateway != gateway_ip + loop: "{{ nmcli_devices }}" + + - name: Remove undesired gateways + command: >- + echo nmcli connection modify '{{ item.connection }}' ipv4.gateway '' + && + echo nmcli connection up '{{ item.connection }}' + when: + - gateway_ip != '' + - item.ip4_gateway != '' + - item.connection != gateway_ip_connection + loop: "{{ nmcli_devices }}" + + - name: Add desired gateways + command: >- + echo nmcli connection modify '{{ item.connection }}' + ipv4.address {{ item.ip4_address }} + ipv4.gateway {{ gateway_ip }} + && + echo nmcli connection up '{{ item.connection }}' + when: + - gateway_ip != '' + - item.ip4_gateway != gateway_ip + - item.connection == gateway_ip_connection + loop: "{{ nmcli_devices }}" + + - name: Create dummy connection and gateway + # see https://docs.k3s.io/installation/airgap#default-network-route + command: >- + nmcli connection add type dummy ifname dummy0 con-name dummy0 + && + nmcli connection modify dummy0 + ipv4.address {{ access_ip }} + ipv4.gateway {{ access_ip }} + ipv4.route-metric 1000 + ipv4.method manual + && + nmcli connection up dummy0 + when: + - gateway_ip == '' # no gateway specified + - nmcli_devices | selectattr('ip4_gateway', 'ne', '') | length == 0 + # no gateway from networks diff --git a/ansible/roles/gateway/tasks/main.yml b/ansible/roles/gateway/tasks/main.yml new file mode 100644 index 000000000..c13ba5ce9 --- /dev/null +++ 
b/ansible/roles/gateway/tasks/main.yml @@ -0,0 +1,7 @@ +- name: Add gateway playbook + copy: + src: gateway-init.yml + dest: /etc/ansible-init/playbooks/05-gateway-init.yml + owner: root + group: root + mode: 0644 diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 5317ab4e0..af519b871 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -169,3 +169,6 @@ extra_packages [chrony] # Hosts where crony configuration is applied. See docs/chrony.md for more details. + +[gateway] +# Add builder to this group to install gateway ansible-init playbook into image diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index d78202843..09670084e 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -112,3 +112,7 @@ builder [chrony] # Hosts where crony configuration is applied. See docs/chrony.md for more details. + +[gateway:children] +# Add builder to this group to install gateway ansible-init playbook into image +builder From 3a4be9a34025d45004bba6b0542f596fb10d456d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 5 Mar 2025 15:52:07 +0000 Subject: [PATCH 05/15] move compute_init playbook --- ansible/roles/compute_init/tasks/install.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index cbacb062e..6032eed53 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -60,7 +60,7 @@ - name: Add compute initialisation playbook copy: src: compute-init.yml - dest: /etc/ansible-init/playbooks/1-compute-init.yml + dest: /etc/ansible-init/playbooks/10-compute-init.yml owner: root group: root mode: 0644 From bd87bf71b7a65f61e617e7fc53c21b443c900611 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 5 Mar 2025 15:55:46 +0000 Subject: [PATCH 
06/15] remove debug values --- ansible/roles/gateway/files/gateway-init.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ansible/roles/gateway/files/gateway-init.yml b/ansible/roles/gateway/files/gateway-init.yml index 08301cbd3..818fc9e3e 100644 --- a/ansible/roles/gateway/files/gateway-init.yml +++ b/ansible/roles/gateway/files/gateway-init.yml @@ -3,12 +3,8 @@ gather_facts: false vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - #gateway_ip: "{{ os_metadata.meta.gateway_ip | default('') }}" + gateway_ip: "{{ os_metadata.meta.gateway_ip | default('') }}" access_ip: "{{ os_metadata.meta.access_ip | default('') }}" - gateway_ip: 172.16.0.1 # DEBUG - actual - # gateway_ip: 192.168.9.1 - #gateway_ip: 10.20.0.0 - gateway_ip: '' tasks: - name: Read nmcli device info command: nmcli --get GENERAL.DEVICE,GENERAL.CONNECTION,IP4.ADDRESS,IP4.GATEWAY device show From 6eac0e27147b70b6faca213a0f6cb36c25d15a14 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 5 Mar 2025 17:03:39 +0000 Subject: [PATCH 07/15] support network filters in ansible-init --- .../common/inventory/group_vars/all/ansible_init.yml | 8 +++++++- requirements.yml | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/environments/common/inventory/group_vars/all/ansible_init.yml b/environments/common/inventory/group_vars/all/ansible_init.yml index be68dbe8c..b137af82b 100644 --- a/environments/common/inventory/group_vars/all/ansible_init.yml +++ b/environments/common/inventory/group_vars/all/ansible_init.yml @@ -1 +1,7 @@ -ansible_init_wait: 1200 # seconds \ No newline at end of file +ansible_init_wait: 1200 # seconds +ansible_init_pip_packages: + - ansible + - jmespath + - requests + # custom below here + - netaddr diff --git a/requirements.yml b/requirements.yml index 7b0dc62d2..a18d7c562 100644 --- a/requirements.yml +++ b/requirements.yml @@ -36,7 +36,7 @@ collections: version: 0.4.0 - name: 
https://github.com/azimuth-cloud/ansible-collection-image-utils type: git - version: 0.4.0 + version: feat/pip-deps # TODO: bump on release - name: kubernetes.core version: 2.4.2 # stackhpc.pulp has pulp.squeezer as dependency, any version, but latest From 7f8fe791da622a58d58171f7a71e04e7b6b81fb7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 6 Mar 2025 09:27:08 +0000 Subject: [PATCH 08/15] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 3e8293206..bdce54838 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250305-1110-534ed276", - "RL9": "openhpc-RL9-250305-1110-534ed276" + "RL8": "openhpc-RL8-250305-1707-6eac0e27", + "RL9": "openhpc-RL9-250305-1707-6eac0e27" } } From 9fbc90e5eafb103eda4f7deba6da68def54dc904 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 6 Mar 2025 11:24:24 +0000 Subject: [PATCH 09/15] support gateway_ip in TF --- .../{{cookiecutter.environment}}/tofu/compute.tf | 1 + .../{{cookiecutter.environment}}/tofu/control.tf | 1 + .../skeleton/{{cookiecutter.environment}}/tofu/login.tf | 1 + .../tofu/node_group/nodes.tf | 2 ++ .../tofu/node_group/variables.tf | 5 +++++ .../{{cookiecutter.environment}}/tofu/variables.tf | 9 ++++++++- 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf index 87ff662a5..7ab27d84f 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf @@ -18,6 +18,7 @@ module "compute" { vnic_types = lookup(each.value, 
"vnic_types", var.vnic_types) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) # optionally set for group: networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf index dc1c05b3b..b4308f93d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf @@ -60,6 +60,7 @@ resource "openstack_compute_instance_v2" "control" { metadata = { environment_root = var.environment_root access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] + gateway_ip = var.gateway_ip } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf index bbfad9cb4..c4a2c74b3 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf @@ -18,6 +18,7 @@ module "login" { vnic_types = lookup(each.value, "vnic_types", var.vnic_types) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) # optionally set for group networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf index f5d3424e6..f08ec1ca3 100644 --- 
a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf @@ -87,6 +87,7 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { environment_root = var.environment_root control_address = var.control_address access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] + gateway_ip = var.gateway_ip }, {for e in var.compute_init_enable: e => true} ) @@ -140,6 +141,7 @@ resource "openstack_compute_instance_v2" "compute" { environment_root = var.environment_root control_address = var.control_address access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] + gateway_ip = var.gateway_ip }, {for e in var.compute_init_enable: e => true} ) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf index 224d25b47..896a28a48 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf @@ -122,3 +122,8 @@ variable "baremetal_nodes" { type = map(string) default = {} } + +variable "gateway_ip" { + type = string + default = "" +} diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index 73c872feb..f8cee09bf 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -58,9 +58,9 @@ variable "login" { must already be allocated to the project. fip_network: Name of network containing ports to attach FIPs to. Only required if multiple networks are defined. 
- match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node availability_zone: Name of availability zone - ignored unless match_ironic_node is true (default: "nova") + gateway_ip: Address to add default route via EOF } @@ -96,6 +96,7 @@ variable "compute" { **NB**: The order in /dev is not guaranteed to match the mapping match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node availability_zone: Name of availability zone - ignored unless match_ironic_node is true (default: "nova") + gateway_ip: Address to add default route via EOF } @@ -172,3 +173,9 @@ variable "root_volume_size" { type = number default = 40 } + +variable "gateway_ip" { + description = "Address to add default route via" + type = string + default = "" +} From 366232da5aa68e94b847bd77e68e56e3e7e68b89 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 6 Mar 2025 12:26:51 +0000 Subject: [PATCH 10/15] fix gateway tag --- ansible/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 2765641e3..b095be7e6 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -90,7 +90,7 @@ - name: Install gateway playbook hosts: gateway - tags: compute_init + tags: gateway become: yes gather_facts: no tasks: From a49efd81e22b6fe661213a497f1899c90c65865e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 6 Mar 2025 12:28:16 +0000 Subject: [PATCH 11/15] get dummy gateway and adding gateway working --- ansible/roles/gateway/files/gateway-init.yml | 43 ++++++++++---------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/ansible/roles/gateway/files/gateway-init.yml b/ansible/roles/gateway/files/gateway-init.yml index 818fc9e3e..b6d604c5c 100644 --- a/ansible/roles/gateway/files/gateway-init.yml +++ b/ansible/roles/gateway/files/gateway-init.yml @@ -27,7 +27,8 @@ - name: Examine whether device address contains gateway_ip 
set_fact: device_is_gateway_device: "{{ nmcli_devices | map(attribute='ip4_address') | map('ansible.utils.network_in_network', gateway_ip) }}" - + # list of bools - false if gateway_ip == '' + - name: Get name of connection containing gateway_ip # might be empty string set_fact: @@ -35,7 +36,11 @@ {{ nmcli_devices | map(attribute='connection') | zip(device_is_gateway_device) | selectattr('1') | map(attribute=0) | list | first | default ('') }} - + + - name: Show debug info + debug: + msg: "gateway_ip={{ gateway_ip }} access_ip={{ access_ip }} gateway_ip_connection={{ gateway_ip_connection }}" + - name: Error if device has a gateway which is not the desired one # TODO: document assert: @@ -48,40 +53,36 @@ loop: "{{ nmcli_devices }}" - name: Remove undesired gateways - command: >- - echo nmcli connection modify '{{ item.connection }}' ipv4.gateway '' - && - echo nmcli connection up '{{ item.connection }}' + shell: | + nmcli connection modify '{{ item.connection }}' ipv4.gateway '' + nmcli connection up '{{ item.connection }}' when: - gateway_ip != '' - item.ip4_gateway != '' - item.connection != gateway_ip_connection loop: "{{ nmcli_devices }}" - - name: Add desired gateways - command: >- - echo nmcli connection modify '{{ item.connection }}' - ipv4.address {{ item.ip4_address }} - ipv4.gateway {{ gateway_ip }} - && - echo nmcli connection up '{{ item.connection }}' + - name: Add desired gateways # TESTED OK + shell: | + nmcli connection modify '{{ item.connection }}' \ + ipv4.address {{ item.ip4_address }} \ + ipv4.gateway {{ gateway_ip }} + nmcli connection up '{{ item.connection }}' when: - gateway_ip != '' - item.ip4_gateway != gateway_ip - item.connection == gateway_ip_connection loop: "{{ nmcli_devices }}" - - name: Create dummy connection and gateway + - name: Create dummy connection and gateway # TESTED OK # see https://docs.k3s.io/installation/airgap#default-network-route - command: >- + shell: | nmcli connection add type dummy ifname dummy0 con-name dummy0 
- && - nmcli connection modify dummy0 - ipv4.address {{ access_ip }} - ipv4.gateway {{ access_ip }} - ipv4.route-metric 1000 + nmcli connection modify dummy0 \ + ipv4.address {{ access_ip }} \ + ipv4.gateway {{ access_ip }} \ + ipv4.route-metric 1000 \ ipv4.method manual - && nmcli connection up dummy0 when: - gateway_ip == '' # no gateway specified From 2d715e162120ae03f86663adae867078c0f2410e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 6 Mar 2025 12:29:37 +0000 Subject: [PATCH 12/15] fail fast if ansible-init failed --- ansible/bootstrap.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 5b873fb31..e4288f1cb 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -8,7 +8,16 @@ stat: path: /etc/systemd/system/ansible-init.service register: _stat_ansible_init_unitfile - + - name: Check ansible-init status + command: systemctl is-failed ansible-init + register: _ansible_init_failed + failed_when: false # rc != 0 for non-failure! + changed_when: false + - name: Check ansible-init hasn't failed (yet) + # NB: only allows early exit if it has, does not catch future failures! 
+ assert: + that: "'failed' not in _ansible_init_failed.stdout" + fail_msg: "ansible-init has failed - check journalctl -xeu ansible-init" - name: Wait for ansible-init to finish wait_for: path: /var/lib/ansible-init.done From d577d4b514999f0e0fe8a4076cd70d3d16950385 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 6 Mar 2025 12:30:06 +0000 Subject: [PATCH 13/15] fix chrony for nodes w/o network access (yet) --- ansible/bootstrap.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e4288f1cb..22d98c2d6 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -67,6 +67,12 @@ tasks: - import_role: name: mrlesmithjr.chrony + # skip install tasks as might not have network yet + tasks_from: config_chrony.yml + vars: + # workaround for set_facts.yml: + chrony_config: /etc/chrony.conf + chrony_service: chronyd - hosts: cluster gather_facts: false From 3b9c6dcfdbaa5e97bcfb9f4adde8201456c43b5f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 6 Mar 2025 17:14:37 +0000 Subject: [PATCH 14/15] configure proxies for k3s too --- ansible/roles/proxy/tasks/main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ansible/roles/proxy/tasks/main.yml b/ansible/roles/proxy/tasks/main.yml index 70a7eca67..d32adfdb3 100644 --- a/ansible/roles/proxy/tasks/main.yml +++ b/ansible/roles/proxy/tasks/main.yml @@ -43,10 +43,14 @@ path: /etc/systemd/system.conf.d/90-proxy.conf section: Manager option: DefaultEnvironment + # k3s uses uppercase: https://docs.k3s.io/advanced#configuring-an-http-proxy value: >- "http_proxy={{ proxy_http_proxy }}" "https_proxy={{ proxy_http_proxy }}" "no_proxy={{ proxy_no_proxy }}" + "HTTP_PROXY={{ proxy_http_proxy }}" + "HTTPS_PROXY={{ proxy_https_proxy }}" + "NO_PROXY={{ proxy_no_proxy }}" no_extra_spaces: true owner: root group: root From 3e1a047971d52a48389ade2a2673e39ae30a39cf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 7 Mar 2025 10:45:20 +0000 Subject: [PATCH 
15/15] add notes on name resolution to network docs --- docs/networks.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/networks.md b/docs/networks.md index 73e603675..ac9d40381 100644 --- a/docs/networks.md +++ b/docs/networks.md @@ -162,6 +162,10 @@ If there is no default route at all (either from a subnet gateway or from ensure [correct](https://docs.k3s.io/installation/airgap#default-network-route) `k3s` operation. +When using a subnet with no default gateway, OpenStack's nameserver for the +subnet may refuse lookups. External nameservers can be defined using the +[resolv_conf](../ansible/roles/resolv_conf/README.md) role. + ## Proxies If some nodes have no outbound connectivity via any networks, the cluster can @@ -185,3 +189,8 @@ compute squid_cache_disk: 1024 # MB squid_cache_mem: '12 GB' ``` + +Note that name resolution must still be possible and may require defining a +nameserver which is directly reachable from the node using the +[resolv_conf](../ansible/roles/resolv_conf/README.md) +role.