Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,5 @@ roles/*
!roles/pytools/**
!roles/rebuild/
!roles/rebuild/**
!roles/gateway/
!roles/gateway/**
17 changes: 16 additions & 1 deletion ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,16 @@
stat:
path: /etc/systemd/system/ansible-init.service
register: _stat_ansible_init_unitfile

- name: Check ansible-init status
command: systemctl is-failed ansible-init
register: _ansible_init_failed
failed_when: false # rc != 0 for non-failure!
changed_when: false
- name: Check ansible-init hasn't failed (yet)
# NB: only allows early exit if it has, does not catch future failures!
assert:
that: "'failed' not in _ansible_init_failed.stdout"
fail_msg: "ansible-init has failed - check journalctl -xeu ansible-init"
- name: Wait for ansible-init to finish
wait_for:
path: /var/lib/ansible-init.done
Expand Down Expand Up @@ -58,6 +67,12 @@
tasks:
- import_role:
name: mrlesmithjr.chrony
# skip install tasks as might not have network yet
tasks_from: config_chrony.yml
vars:
# workaround for set_facts.yml:
chrony_config: /etc/chrony.conf
chrony_service: chronyd

- hosts: cluster
gather_facts: false
Expand Down
16 changes: 15 additions & 1 deletion ansible/cleanup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,21 @@

- name: Cleanup /tmp
command : rm -rf /tmp/*


- name: Delete files triggering vulnerability scans
ansible.builtin.file:
path: "{{ item }}"
state: absent
loop: # NB: items here MUST have a justification!
# ondemand install: raised at https://github.com/OSC/ondemand/security/advisories/GHSA-f7j8-ppqm-m5vw
# All declared not to be an issue by Open Ondemand as relevant packages not installed
- "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock"
- "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock"
- /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
# chrony role: only used for role dev, venv never created on disk
- /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/poetry.lock
- /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/requirements.txt

- name: Get package facts
package_facts:

Expand Down
11 changes: 10 additions & 1 deletion ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
- import_playbook: extras.yml

# TODO: is this the right place?
- name: Install compute_init script
- name: Install compute_init playbook
hosts: compute_init
tags: compute_init # tagged to allow running on cluster instances for dev
become: yes
Expand All @@ -88,6 +88,15 @@
name: compute_init
tasks_from: install.yml

- name: Install gateway playbook
hosts: gateway
tags: gateway
become: yes
gather_facts: no
tasks:
- include_role:
name: gateway

- hosts: builder
become: yes
gather_facts: yes
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/compute_init/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
- name: Add compute initialisation playbook
copy:
src: compute-init.yml
dest: /etc/ansible-init/playbooks/1-compute-init.yml
dest: /etc/ansible-init/playbooks/10-compute-init.yml
owner: root
group: root
mode: 0644
90 changes: 90 additions & 0 deletions ansible/roles/gateway/files/gateway-init.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# ansible-init playbook which manages the node's default route.
#
# Reads the optional `gateway_ip` and `access_ip` keys from the instance's
# OpenStack metadata, inspects the current NetworkManager devices, then:
#   - fails if the connection which should carry `gateway_ip` already has a
#     different gateway
#   - removes gateways from all other connections when `gateway_ip` is set
#   - sets `gateway_ip` on the connection whose address range contains it
#   - creates a dummy connection/route when there is no gateway at all
- hosts: localhost
  #become: true
  # NOTE(review): privilege escalation is disabled above; presumably
  # ansible-init already runs as root - confirm.
  gather_facts: false
  vars:
    # NB: these are lazily templated, so the metadata URL is fetched whenever
    # one of them is evaluated.
    os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
    gateway_ip: "{{ os_metadata.meta.gateway_ip | default('') }}"
    access_ip: "{{ os_metadata.meta.access_ip | default('') }}"
  tasks:
    - name: Read nmcli device info
      command: nmcli --get GENERAL.DEVICE,GENERAL.CONNECTION,IP4.ADDRESS,IP4.GATEWAY device show
      register: _nmcli_device_raw
      changed_when: false

    - name: Set fact for nmcli devices
      set_fact:
        # creates a list of dicts with keys as per zip arg below, values might be ''
        nmcli_devices: >-
          {{
            _nmcli_device_raw.stdout_lines |
            batch(5, '') |
            map('zip', ['device', 'connection', 'ip4_address', 'ip4_gateway']) |
            map('map', 'reverse') | map('community.general.dict')
          }}
        # batch=5 because per device have 4x lines + blank line between devices
        # batch takes default '' because last device doesn't have trailing blank line
        # zip pairs (value, key) so each pair is reversed before building the dict

    - name: Examine whether device address contains gateway_ip
      set_fact:
        # list of bools, one per entry of nmcli_devices - false if gateway_ip == ''
        device_is_gateway_device: "{{ nmcli_devices | map(attribute='ip4_address') | map('ansible.utils.network_in_network', gateway_ip) | list }}"

    - name: Get name of connection containing gateway_ip
      # might be empty string
      set_fact:
        gateway_ip_connection: >-
          {{ nmcli_devices | map(attribute='connection') |
          zip(device_is_gateway_device) | selectattr('1') |
          map(attribute=0) | list | first | default('') }}

    - name: Show debug info
      debug:
        msg: "gateway_ip={{ gateway_ip }} access_ip={{ access_ip }} gateway_ip_connection={{ gateway_ip_connection }}"

    - name: Error if device has a gateway which is not the desired one
      # TODO: document
      # The `when` conditions select exactly the conflicting case, so the
      # assertion below always fails for selected items and reports fail_msg.
      assert:
        # dict key is `ip4_gateway` (see "Set fact for nmcli devices")
        that: item.ip4_gateway == gateway_ip
        fail_msg: "Device {{ item | to_nice_json }} has gateway: cannot apply gateway {{ gateway_ip }}"
      when:
        - gateway_ip != '' # nothing to conflict with if no gateway requested
        - item.connection == gateway_ip_connection
        - item.ip4_gateway != ''
        - item.ip4_gateway != gateway_ip
      loop: "{{ nmcli_devices }}"

    - name: Remove undesired gateways
      # set -e: fail the task if modifying the connection fails, rather than
      # only reporting the return code of the final command
      shell: |
        set -e
        nmcli connection modify {{ item.connection | quote }} ipv4.gateway ''
        nmcli connection up {{ item.connection | quote }}
      when:
        - gateway_ip != ''
        - item.ip4_gateway != ''
        - item.connection != gateway_ip_connection
      loop: "{{ nmcli_devices }}"

    - name: Add desired gateways # TESTED OK
      # the existing address is re-applied alongside the gateway
      shell: |
        set -e
        nmcli connection modify {{ item.connection | quote }} \
          ipv4.address {{ item.ip4_address }} \
          ipv4.gateway {{ gateway_ip }}
        nmcli connection up {{ item.connection | quote }}
      when:
        - gateway_ip != ''
        - item.ip4_gateway != gateway_ip
        - item.connection == gateway_ip_connection
      loop: "{{ nmcli_devices }}"

    - name: Create dummy connection and gateway # TESTED OK
      # see https://docs.k3s.io/installation/airgap#default-network-route
      shell: |
        set -e
        nmcli connection add type dummy ifname dummy0 con-name dummy0
        nmcli connection modify dummy0 \
          ipv4.address {{ access_ip }} \
          ipv4.gateway {{ access_ip }} \
          ipv4.route-metric 1000 \
          ipv4.method manual
        nmcli connection up dummy0
      when:
        - gateway_ip == '' # no gateway specified
        - nmcli_devices | selectattr('ip4_gateway', 'ne', '') | length == 0
        # no gateway from networks
7 changes: 7 additions & 0 deletions ansible/roles/gateway/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Installs the gateway playbook into the ansible-init playbooks directory.
# NOTE(review): the `05-` prefix presumably orders it ahead of higher-numbered
# playbooks in that directory - confirm ansible-init's run order.
- name: Add gateway playbook
  copy:
    src: gateway-init.yml
    dest: /etc/ansible-init/playbooks/05-gateway-init.yml
    owner: root
    group: root
    # quoted so the mode is always passed as an octal string
    mode: "0644"
4 changes: 4 additions & 0 deletions ansible/roles/proxy/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,14 @@
path: /etc/systemd/system.conf.d/90-proxy.conf
section: Manager
option: DefaultEnvironment
# k3s uses uppercase: https://docs.k3s.io/advanced#configuring-an-http-proxy
value: >-
"http_proxy={{ proxy_http_proxy }}"
"https_proxy={{ proxy_http_proxy }}"
"no_proxy={{ proxy_no_proxy }}"
"HTTP_PROXY={{ proxy_http_proxy }}"
"HTTPS_PROXY={{ proxy_https_proxy }}"
"NO_PROXY={{ proxy_no_proxy }}"
no_extra_spaces: true
owner: root
group: root
Expand Down
103 changes: 94 additions & 9 deletions docs/networks.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ subnets or associated infrastructure such as routers. The requirements are that:
4. At least one network on each node provides outbound internet access (either
directly, or via a proxy).

Futhermore, it is recommended that the deploy host has an interface on the
access network. While it is possible to e.g. use a floating IP on a login node
as an SSH proxy to access the other nodes, this can create problems in recovering
the cluster if the login node is unavailable and can make Ansible problems harder
to debug.
Addresses on the "access network" are used for `ansible_host` and `k3s` node IPs.

It is recommended that the deploy host either has a direct connection to the
"access network" or jumps through a host on it which is not part of the appliance.
Using e.g. a floating IP on a login node as a jumphost creates problems in
recovering the cluster if the login node is unavailable and can make Ansible
problems harder to debug.

> [!WARNING]
> If home directories are on a shared filesystem with no authentication (such
Expand All @@ -29,8 +31,8 @@ the OpenTofu variables. These will normally be set in
need to be overridden for specific environments, this can be done via an OpenTofu
module as discussed [here](./production.md).

Note that if an OpenStack subnet has a gateway IP defined then nodes with ports
attached to that subnet will get a default route set via that gateway.
Note that if an OpenStack subnet has a gateway IP defined then by default nodes
with ports attached to that subnet get a default route set via that gateway.

## Single network
This is the simplest possible configuration. A single network and subnet is
Expand Down Expand Up @@ -77,8 +79,9 @@ vnic_types = {
## Additional networks on some nodes

This example shows how to modify variables for specific node groups. In this
case a baremetal node group has a second network attached. As above, only a
single subnet can have a gateway IP.
case a baremetal node group has a second network attached. Here "subnetA" must
have a gateway IP defined and "subnetB" must not, to avoid routing problems on
the multi-homed compute nodes.

```terraform
cluster_networks = [
Expand Down Expand Up @@ -109,3 +112,85 @@ compute = {
}
...
```

## Multiple networks with non-default gateways

In some multiple network configurations it may be necessary to manage default
routes rather than them being automatically created from a subnet gateway.
This can be done using the tofu variable `gateway_ip` which can be set for the
cluster and/or overridden on the compute and login groups. If this is set:
- a default route via that address will be created on the appropriate interface
during boot if it does not exist
- any other default routes will be removed

For example the cluster configuration below has a "campus" network with a
default gateway which provides inbound SSH / ondemand access and outbound
internet attached only to the login nodes, and a "data" network attached to
all nodes. The "data" network has no gateway IP set on its subnet to avoid dual
default routes and routing conflicts on the multi-homed login nodes, but does
have outbound connectivity via a router:

```terraform
cluster_networks = [
{
network = "data" # access network, CIDR 172.16.0.0/23
subnet = "data_subnet"
}
]

login = {
interactive = {
nodes = ["login-0"]
extra_networks = [
{
network = "campus"
subnet = "campus_subnet"
}
]
}
}
compute = {
general = {
nodes = ["compute-0", "compute-1"]
gateway_ip = "172.16.0.1" # Router interface
}
}
```

If there is no default route at all (either from a subnet gateway or from
`gateway_ip`) then a dummy route is created via the access network interface to
ensure [correct](https://docs.k3s.io/installation/airgap#default-network-route)
`k3s` operation.

When using a subnet with no default gateway, OpenStack's nameserver for the
subnet may refuse lookups. External nameservers can be defined using the
[resolv_conf](../ansible/roles/resolv_conf/README.md) role.

## Proxies

If some nodes have no outbound connectivity via any networks, the cluster can
be configured to deploy a [squid proxy](https://www.squid-cache.org/) on a node
with outbound connectivity. Assuming the `compute` and `control` nodes have no
outbound connectivity and the `login` node does, the minimal configuration for
this is:

```ini
# environments/$SITE/inventory/groups:
[squid:children]
login
[proxy:children]
control
compute
```

```yaml
# environments/$SITE/inventory/group_vars/all/squid.yml:
# these are just examples
squid_cache_disk: 1024 # MB
squid_cache_mem: '12 GB'
```

Note that name resolution must still be possible and may require defining a
nameserver which is directly reachable from the node using the
[resolv_conf](../ansible/roles/resolv_conf/README.md)
role.
14 changes: 0 additions & 14 deletions environments/.stackhpc/hooks/post.yml

This file was deleted.

4 changes: 2 additions & 2 deletions environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"cluster_image": {
"RL8": "openhpc-RL8-250304-1029-4d6dee03",
"RL9": "openhpc-RL9-250304-1029-4d6dee03"
"RL8": "openhpc-RL8-250305-1707-6eac0e27",
"RL9": "openhpc-RL9-250305-1707-6eac0e27"
}
}
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
ansible_init_wait: 1200 # seconds
ansible_init_wait: 1200 # seconds
ansible_init_pip_packages:
- ansible
- jmespath
- requests
# custom below here
- netaddr
3 changes: 3 additions & 0 deletions environments/common/inventory/groups
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,6 @@ extra_packages

[chrony]
# Hosts where chrony configuration is applied. See docs/chrony.md for more details.

[gateway]
# Add builder to this group to install gateway ansible-init playbook into image
4 changes: 4 additions & 0 deletions environments/common/layouts/everything
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,7 @@ builder

[chrony]
# Hosts where chrony configuration is applied. See docs/chrony.md for more details.

[gateway:children]
# Add builder to this group to install gateway ansible-init playbook into image
builder
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ module "compute" {
vnic_types = lookup(each.value, "vnic_types", var.vnic_types)
volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances)
root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size)
gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip)

# optionally set for group:
networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", []))
Expand Down
Loading
Loading