Skip to content

Commit 684123f

Browse files
committed
Merge remote-tracking branch 'origin/main' into ci/enable-linting
2 parents f6477c7 + b62a4c5 commit 684123f

File tree

17 files changed

+217
-14
lines changed

17 files changed

+217
-14
lines changed

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,5 @@ roles/*
9696
!roles/nhc/**
9797
!roles/eessi/
9898
!roles/eessi/**
99+
!roles/topology/
100+
!roles/topology/**

ansible/roles/proxy/tasks/main.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
- name: Validate http_proxy definition
22
ansible.builtin.assert:
3-
that: proxy_http_proxy != '' # this is default if squid not active
3+
that: proxy_http_proxy != ''
44
fail_msg: >-
55
Variable proxy_http_proxy cannot be the empty string for hosts in the
6-
proxy group. See environment/common/inventory/group_vars/all/proxy.yml.
6+
proxy group. See environments/common/inventory/group_vars/all/proxy.yml
7+
for convenience variables to set this.
78
- name: Define configuration in /etc/environment
89
tags: proxy
910
ansible.builtin.lineinfile:

ansible/roles/topology/README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
topology
2+
========
3+
4+
Templates out /etc/slurm/topology.conf file based on an OpenStack project for use by
5+
Slurm's [topology/tree plugin](https://slurm.schedmd.com/topology.html). Models
6+
cluster as tree with a hierarchy of:
7+
8+
Top-level inter-rack Switch -> Availability Zones -> Hypervisors -> VMs
9+
10+
Warning: This role doesn't currently trigger a restart of Slurm so will therefore not
11+
reconfigure an already running cluster after an `ansible/site.yml` run. You will therefore need
12+
to run the `ansible/adhoc/restart-slurm.yml` playbook for changes to topology.conf to be
13+
recognised.
14+
15+
Role Variables
16+
--------------
17+
18+
- `topology_nodes`: Required list of strs. List of inventory hostnames of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`.
19+
- `topology_conf_template`: Optional str. Path to Jinja2 template of topology.conf file. Default
20+
`templates/topology.conf.j2`
21+
- `topology_above_rack_topology`: Optionally multiline str. Used to define topology above racks/AZs if
22+
you wish to partition racks further under different logical switches. New switches above should be
23+
defined as [SwitchName lines](https://slurm.schedmd.com/topology.html#hierarchical) referencing
24+
rack Availability Zones under that switch in their `Switches` fields. These switches must themselves
25+
be under a top-level switch, e.g.:
26+
```
27+
topology_above_rack_topology: |
28+
SwitchName=rack-group-1 Switches=rack-az-1,rack-az-2
29+
SwitchName=rack-group-2 Switches=rack-az-3,rack-az-4
30+
SwitchName=top-level Switches=rack-group-1,rack-group-2
31+
```
32+
Defaults to an empty string, which causes all AZs to be put under a
33+
single top level switch.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Nodes to be included in topology tree, must include all Slurm compute nodes
topology_nodes: []

# Override to use custom topology.conf template
topology_conf_template: templates/topology.conf.j2

# Optional (multiline) string of SwitchName lines describing the topology above
# the per-AZ switches. It is written verbatim into topology.conf by the default
# template. When left as the empty string, the default template instead emits a
# single "master" switch containing every AZ.
topology_above_rack_topology: ""
8+
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#!/usr/bin/python
2+
3+
# Copyright: (c) 2025, StackHPC
4+
# Apache 2 License
5+
6+
from ansible.module_utils.basic import AnsibleModule
7+
import openstack
8+
9+
# Module documentation in the YAML-in-a-string form used by Ansible modules.
# The description is a folded block scalar because it contains ": ", which is
# not allowed inside a multi-line plain YAML scalar.
DOCUMENTATION = """
---
module: map_hosts
short_description: Creates map of OpenStack VM network topology
description:
  - >-
    Creates map representing the network topology tree of an OpenStack project with a hierarchy
    of: Availability Zone -> Hypervisors -> VMs/Baremetal instances
options:
  compute_vms:
    description:
      - List of VM names within the target OpenStack project to include in the tree
    required: true
    type: list
    elements: str
author:
  - Steve Brasier, William Tripp, StackHPC
"""

# Shape of the value returned under the "topology" key of the module result.
RETURN = """
topology:
  description: >-
    Map representing tree of project topology. Top level keys are AZ names, their values
    are maps of shortened unique identifiers of hosts UUIDs to lists of VM names
    (i.e. dict[str, dict[str, list[str]]])
  returned: success
  type: dict
  sample:
    "nova-az":
      "afe9":
        - "mycluster-compute-0"
        - "mycluster-compute-1"
      "00f9":
        - "mycluster-compute-vm-on-other-hypervisor"
"""

# Minimal usage example; compute_vms takes a list of server names.
EXAMPLES = """
- name: Get topology map
  map_hosts:
    compute_vms:
      - mycluster-compute-0
      - mycluster-compute-1
"""
49+
50+
def min_prefix(uuids, start=4):
    """Return the smallest prefix length >= start which keeps the uuids unique.

    Args:
        uuids: List of identifier strings, expected to be distinct.
        start: Minimum prefix length to consider.

    Returns:
        The first length, counting up from ``start``, at which truncating
        every uuid to that length still yields ``len(uuids)`` distinct values.
        If no shorter prefix achieves that (or ``uuids`` contains duplicates),
        the larger of ``start`` and the longest uuid length is returned, so
        slicing with the result leaves every uuid whole. For an empty list
        ``start`` is returned.
    """
    if not uuids:
        # Nothing to disambiguate; avoids indexing into an empty list.
        return start

    longest = max(len(uuid) for uuid in uuids)
    for length in range(start, longest):
        prefixes = {uuid[:length] for uuid in uuids}
        if len(prefixes) == len(uuids):
            return length

    # Only the full strings (if anything) are unique: always return an int
    # rather than falling off the end of the function.
    return max(start, longest)
56+
57+
def run_module():
    """Build the AZ -> host -> VM-names map and return it as the module result.

    Reads the ``compute_vms`` module parameter (list of server names), lists
    the servers visible through the OpenStack connection, keeps those whose
    name is in ``compute_vms`` and groups their names by availability zone and
    host id. Host ids are then shortened to a common prefix length chosen by
    ``min_prefix``. The module always reports ``changed: False`` and exits via
    ``module.exit_json`` with the map under the ``topology`` key.
    """
    module_args = dict(
        compute_vms=dict(type='list', elements='str', required=True)
    )
    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)

    conn = openstack.connection.from_config()

    # Set membership instead of scanning the parameter list for every server.
    wanted_names = set(module.params["compute_vms"])
    servers = [s for s in conn.compute.servers() if s["name"] in wanted_names]

    topo = {}
    host_ids = set()
    for s in servers:
        host_id = s['host_id']
        # NOTE(review): servers without a host_id are left out of the tree
        # entirely - this assumes the original grouped all of the topo updates
        # under the host_id check; confirm against the unmangled source.
        if host_id == '':  # empty string if e.g. server is shelved
            continue
        host_ids.add(host_id)
        topo.setdefault(s['availability_zone'], {}).setdefault(host_id, []).append(s['name'])

    # min_prefix() indexes the first element of its argument, so only call it
    # when at least one host id was collected; a None slice bound keeps the
    # (then empty) map untouched.
    uuid_len = min_prefix(list(host_ids)) if host_ids else None

    for az in topo:
        topo[az] = dict((k[:uuid_len], v) for (k, v) in topo[az].items())

    result = {
        "changed": False,
        "topology": topo,
    }

    module.exit_json(**result)
91+
92+
93+
def main():
    """Entry point used when the file is executed as a script."""
    run_module()


# Run the module only on direct execution, not on import.
if __name__ == "__main__":
    main()
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Runs once on localhost without privilege escalation; the result is
# registered as _topology for the templating task.
- name: Map instances to hosts
  map_hosts:
    compute_vms: "{{ topology_nodes }}"
  delegate_to: localhost
  run_once: true
  become: false
  register: _topology
8+
9+
# Renders the (overridable) topology template to Slurm's topology.conf.
- name: Template topology.conf
  become: true
  ansible.builtin.template:
    src: "{{ topology_conf_template }}"
    dest: /etc/slurm/topology.conf
    owner: root
    group: root
    # Quoted so YAML passes a string rather than an octal-parsed integer.
    mode: "0644"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# topology.conf
# Switch Configuration
{% for az, az_hosts in _topology.topology.items() %}
{% for instance_host, host_vms in az_hosts.items() %}
SwitchName={{ instance_host }} Nodes={{ host_vms | join(",") }}
{% endfor %}
SwitchName={{ az }} Switches={{ az_hosts.keys() | join(",") }}
{% endfor %}
{% if topology_above_rack_topology == '' %}
SwitchName=master Switches={{ _topology.topology.keys() | join(",") }}
{% else %}
{{ topology_above_rack_topology }}
{% endif %}

ansible/slurm.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,14 @@
6060
tags:
6161
- openhpc
6262
tasks:
63-
- ansible.builtin.include_role:
63+
- include_role:
64+
name: topology
65+
# Gated on topology group having compute nodes but role also
66+
# needs to run on control and login nodes
67+
when:
68+
- appliances_mode == 'configure'
69+
- groups['topology'] | length > 0
70+
- include_role:
6471
name: stackhpc.openhpc
6572
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
6673

environments/common/inventory/group_vars/all/openhpc.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ openhpc_config_default:
5858
- enable_configless
5959
TaskPlugin: task/cgroup,task/affinity
6060
ReturnToService: 2 # workaround for templating bug TODO: Remove once on stackhpc.openhpc v1.2.0
61+
TopologyPlugin: "topology/{{ 'tree' if (topology_nodes | length) > 0 else 'flat' }}"
6162

6263
# default additional slurm.conf parameters when "rebuild" enabled:
6364
openhpc_config_rebuild:

environments/common/inventory/group_vars/all/proxy.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@ proxy_remove: false
2121

2222
# full http proxy string - override if the above don't provide enough control:
2323
proxy_http_proxy: >-
24-
{% if groups['squid'] | length > 0 %}
24+
{%- if proxy_http_address != '' -%}
2525
http://
2626
{%- if proxy_basic_password -%}
2727
{{ proxy_basic_user }}:{{ proxy_basic_password }}@
2828
{%- endif -%}
2929
{{ proxy_http_address }}:{{ proxy_http_port }}
30-
{% else %}
30+
{%- else %}
3131
32-
{% endif %}
32+
{%- endif %}

0 commit comments

Comments
 (0)