From c9fd09af79db48476cbee171dc29e4563fe25299 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 16 Jan 2025 10:12:15 +0000 Subject: [PATCH 1/4] docs for cloud-init gateway --- README.md | 2 ++ docs/production.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/README.md b/README.md index 54b74d799..9626fb369 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ Before starting ensure that: - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). - Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand. +- All instances will get a default route (needed for `k3s`). Normally the gateway + is defined on the relevant subnet. ### Setup deploy host diff --git a/docs/production.md b/docs/production.md index 59b9f3775..73489ac6d 100644 --- a/docs/production.md +++ b/docs/production.md @@ -122,6 +122,35 @@ and referenced from the `site` and `production` environments, e.g.: - If floating IPs are required for login nodes, modify the OpenTofu configurations appropriately. +- The main [README.md](../README.md) notes that all nodes require a default + route. This is to [allow k3s](https://docs.k3s.io/installation/airgap#default-network-route) + to detect the node's primary IP. Normally nodes get a default route from the + gateway defined on the subnet, but if networking must differ between hosts this + can be problematic. For example if the cluster has two networks with only + some nodes dual-homed, a gateway cannot be set on both subnets as this would + create routing problems for the dual-homed nodes. 
In this case set + `gateway_nmcli_connection = "dummy0"` in the OpenTofu compute group definition(s) + to create a dummy route using cloud-init as per the linked k3s docs, e.g.: + + ```terraform + # environments/$ENV/terraform/main.tf: + ... + compute = { + general = { + flavor = "general.v1.small" + nodes = [ + "general-0", + "general-1", + ] + gateway_nmcli_connection = "dummy0" + } + ... + ``` + + Note that the `gateway_nmcli_connection` and `gateway_ip` options can also be + used to set a real default route in cases where the gateway cannot be defined + on the subnet for some reason. + - Consider whether mapping of baremetal nodes to ironic nodes is required. See [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). From eb2e92c5a99c42ec11c407d8efe79e99d2573e22 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 16 Jan 2025 10:38:35 +0000 Subject: [PATCH 2/4] add cloud-init gateway setup for compute nodes --- .../terraform/compute.tf | 2 ++ .../terraform/compute/nodes.tf | 12 ++++++++++++ .../terraform/compute/variables.tf | 14 +++++++++++++- .../terraform/variables.tf | 5 +++++ 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index a90108924..141ff3fd1 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -19,6 +19,8 @@ module "compute" { volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) extra_volumes = lookup(each.value, "extra_volumes", {}) + gateway_nmcli_connection = lookup(each.value, "gateway_nmcli_connection", "") + gateway_ip = lookup(each.value, "gateway_ip", "") compute_init_enable = lookup(each.value, "compute_init_enable", []) diff --git 
a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 9bb75466e..fee03098d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -86,6 +86,18 @@ resource "openstack_compute_instance_v2" "compute" { user_data = <<-EOF #cloud-config fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} + + runcmd: +%{ if var.gateway_nmcli_connection == "dummy0" ~} + - nmcli connection add type dummy ifname dummy0 con-name dummy0 + - nmcli connection modify dummy0 ipv4.address ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.gateway ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.route-metric 1000 ipv4.method manual +%{ endif ~} +%{ if (var.gateway_nmcli_connection != "") && (var.gateway_nmcli_connection != "dummy0") ~} + - nmcli connection modify '${var.gateway_nmcli_connection}' ipv4.address ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.gateway ${var.gateway_ip} +%{ endif ~} +%{ if var.gateway_nmcli_connection != "" } + - nmcli connection up '${var.gateway_nmcli_connection}' +%{ endif ~} EOF } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index b0e489017..05d6b162e 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -93,4 +93,16 @@ variable "compute_init_enable" { type = list(string) description = "Groups to activate for ansible-init compute rebuilds" default = [] -} \ No newline at end of file +} + +variable "gateway_nmcli_connection" { + description = "Name of nmcli connection for 
default gateway, '' for none or 'dummy0' to create a dummy interface" + type = string + default = "" +} + +variable "gateway_ip" { + description = "IP of default gateway. Ignored when gateway_nmcli_connection == 'dummy0'" + type = string + default = "" +} diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index bdffd40ce..5a5722175 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -60,6 +60,11 @@ variable "compute" { Values are a mapping with: size: Size of volume in GB **NB**: The order in /dev is not guaranteed to match the mapping + gateway_nmcli_connection: Name of nmcli connection to set a default + route on via cloud-init, e.g. "System eth0" + or "Bond bond0". Use "dummy0" to create + a dummy interface with dummy route. + gateway_ip: IP of default gateway. Ignored when gateway_nmcli_connection == "dummy0". EOF } From 4abfaf65c9c82bc63f130407e711000540896566 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 16 Jan 2025 10:12:15 +0000 Subject: [PATCH 3/4] docs for cloud-init gateway --- README.md | 2 ++ docs/production.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/README.md b/README.md index 54b74d799..9626fb369 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ Before starting ensure that: - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). - Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand. 
+- All instances will get a default route (needed for `k3s`). Normally the gateway + is defined on the relevant subnet. ### Setup deploy host diff --git a/docs/production.md b/docs/production.md index 59b9f3775..73489ac6d 100644 --- a/docs/production.md +++ b/docs/production.md @@ -122,6 +122,35 @@ and referenced from the `site` and `production` environments, e.g.: - If floating IPs are required for login nodes, modify the OpenTofu configurations appropriately. +- The main [README.md](../README.md) notes that all nodes require a default + route. This is to [allow k3s](https://docs.k3s.io/installation/airgap#default-network-route) + to detect the node's primary IP. Normally nodes get a default route from the + gateway defined on the subnet, but if networking must differ between hosts this + can be problematic. For example if the cluster has two networks with only + some nodes dual-homed, a gateway cannot be set on both subnets as this would + create routing problems for the dual-homed nodes. In this case set + `gateway_nmcli_connection = "dummy0"` in the OpenTofu compute group definition(s) + to create a dummy route using cloud-init as per the linked k3s docs, e.g.: + + ```terraform + # environments/$ENV/terraform/main.tf: + ... + compute = { + general = { + flavor = "general.v1.small" + nodes = [ + "general-0", + "general-1", + ] + gateway_nmcli_connection = "dummy0" + } + ... + ``` + + Note that the `gateway_nmcli_connection` and `gateway_ip` options can also be + used to set a real default route in cases where the gateway cannot be defined + on the subnet for some reason. + - Consider whether mapping of baremetal nodes to ironic nodes is required. See [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). 
From 5542656e1ce91770c24ebcb627d4c089e8e6954e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 16 Jan 2025 10:38:35 +0000 Subject: [PATCH 4/4] add cloud-init gateway setup for compute nodes --- .../terraform/compute.tf | 2 ++ .../terraform/compute/nodes.tf | 12 ++++++++++++ .../terraform/compute/variables.tf | 14 +++++++++++++- .../terraform/variables.tf | 5 +++++ 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index baf28aaf9..f5c51433c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -19,6 +19,8 @@ module "compute" { volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) extra_volumes = lookup(each.value, "extra_volumes", {}) + gateway_nmcli_connection = lookup(each.value, "gateway_nmcli_connection", "") + gateway_ip = lookup(each.value, "gateway_ip", "") compute_init_enable = lookup(each.value, "compute_init_enable", []) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 9bb75466e..fee03098d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -86,6 +86,18 @@ resource "openstack_compute_instance_v2" "compute" { user_data = <<-EOF #cloud-config fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} + + runcmd: +%{ if var.gateway_nmcli_connection == "dummy0" ~} + - nmcli connection add type dummy ifname dummy0 con-name dummy0 + - nmcli connection modify dummy0 ipv4.address 
${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.gateway ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.route-metric 1000 ipv4.method manual +%{ endif ~} +%{ if (var.gateway_nmcli_connection != "") && (var.gateway_nmcli_connection != "dummy0") ~} + - nmcli connection modify '${var.gateway_nmcli_connection}' ipv4.address ${openstack_networking_port_v2.compute[each.key].all_fixed_ips[0]} ipv4.gateway ${var.gateway_ip} +%{ endif ~} +%{ if var.gateway_nmcli_connection != "" } + - nmcli connection up '${var.gateway_nmcli_connection}' +%{ endif ~} EOF } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index b0e489017..05d6b162e 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -93,4 +93,16 @@ variable "compute_init_enable" { type = list(string) description = "Groups to activate for ansible-init compute rebuilds" default = [] -} \ No newline at end of file +} + +variable "gateway_nmcli_connection" { + description = "Name of nmcli connection for default gateway, '' for none or 'dummy0' to create a dummy interface" + type = string + default = "" +} + +variable "gateway_ip" { + description = "IP of default gateway. 
Ignored when gateway_nmcli_connection == 'dummy0'" + type = string + default = "" +} diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 7b0b695d3..9b1218906 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -60,6 +60,11 @@ variable "compute" { Values are a mapping with: size: Size of volume in GB **NB**: The order in /dev is not guaranteed to match the mapping + gateway_nmcli_connection: Name of nmcli connection to set a default + route on via cloud-init, e.g. "System eth0" + or "Bond bond0". Use "dummy0" to create + a dummy interface with dummy route. + gateway_ip: IP of default gateway. Ignored when gateway_nmcli_connection == "dummy0". EOF }