diff --git a/README.md b/README.md index 54b74d799..9626fb369 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ Before starting ensure that: - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). - Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand. +- All instances will get a default route (needed for `k3s`). Normally this comes from the gateway + defined on the relevant subnet; see [docs/production.md](docs/production.md) if that is not possible. ### Setup deploy host diff --git a/docs/production.md b/docs/production.md index 59b9f3775..73489ac6d 100644 --- a/docs/production.md +++ b/docs/production.md @@ -122,6 +122,35 @@ and referenced from the `site` and `production` environments, e.g.: - If floating IPs are required for login nodes, modify the OpenTofu configurations appropriately. +- The main [README.md](../README.md) notes that all nodes require a default + route. This is to [allow k3s](https://docs.k3s.io/installation/airgap#default-network-route) + to detect the node's primary IP. Normally nodes get a default route from the + gateway defined on the subnet, but if networking must differ between hosts, this + can be problematic. For example, if the cluster has two networks with only + some nodes dual-homed, a gateway cannot be set on both subnets as this would + create routing problems for the dual-homed nodes. In this case, set + `gateway_nmcli_connection = "dummy0"` in the OpenTofu compute group definition(s) + to create a dummy route using cloud-init as per the linked k3s docs, e.g.: + + ```terraform + # environments/$ENV/tofu/main.tf: + ... 
+ compute = { + general = { + flavor = "general.v1.small" + nodes = [ + "general-0", + "general-1", + ] + gateway_nmcli_connection = "dummy0" + } + ... + ``` + + Note that the `gateway_nmcli_connection` and `gateway_ip` options can also be + used to set a real default route in cases where the gateway cannot be defined + on the subnet for some reason. + - Consider whether mapping of baremetal nodes to ironic nodes is required. See [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf index baf28aaf9..f5c51433c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf @@ -19,6 +19,8 @@ module "compute" { volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) extra_volumes = lookup(each.value, "extra_volumes", {}) + gateway_nmcli_connection = lookup(each.value, "gateway_nmcli_connection", "") + gateway_ip = lookup(each.value, "gateway_ip", "") compute_init_enable = lookup(each.value, "compute_init_enable", []) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf index 9bb75466e..fee03098d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf @@ -86,6 +86,18 @@ resource "openstack_compute_instance_v2" "compute" { user_data = <<-EOF #cloud-config fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} + +%{ if var.gateway_nmcli_connection != "" ~} + # Set a default route, which k3s needs to detect the primary IP of the node + runcmd: +%{ if var.gateway_nmcli_connection == "dummy0" ~} + - nmcli connection add type dummy ifname dummy0 con-name dummy0 + - nmcli 
connection modify dummy0 ipv4.address ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.gateway ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.route-metric 1000 ipv4.method manual +%{ else ~} + - nmcli connection modify '${var.gateway_nmcli_connection}' ipv4.address ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.gateway ${var.gateway_ip} +%{ endif ~} + - nmcli connection up '${var.gateway_nmcli_connection}' +%{ endif ~} EOF } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf index b0e489017..05d6b162e 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf @@ -93,4 +93,16 @@ variable "compute_init_enable" { type = list(string) description = "Groups to activate for ansible-init compute rebuilds" default = [] -} \ No newline at end of file +} + +variable "gateway_nmcli_connection" { + description = "Name of nmcli connection for default gateway, '' for none or 'dummy0' to create a dummy interface" + type = string + default = "" +} + +variable "gateway_ip" { + description = "IP of default gateway. 
Ignored when gateway_nmcli_connection == 'dummy0'" + type = string + default = "" +} diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index 7b0b695d3..9b1218906 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -60,6 +60,11 @@ variable "compute" { Values are a mapping with: size: Size of volume in GB **NB**: The order in /dev is not guaranteed to match the mapping + gateway_nmcli_connection: Name of nmcli connection to set a default + route on via cloud-init, e.g. "System eth0" + or "Bond bond0". Use "dummy0" to create + a dummy interface with dummy route. + gateway_ip: IP of default gateway. Ignored when gateway_nmcli_connection == "dummy0". EOF }