Skip to content

Commit be4a657

Browse files
authored
Merge branch 'main' into docs/terraform-to-tofu
2 parents 5310b7d + 6569a37 commit be4a657

File tree

15 files changed

+189
-87
lines changed

15 files changed

+189
-87
lines changed

README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,19 @@ Create an OpenTofu variables file to define the required infrastructure, e.g.:
8989
cluster_subnet = "some_subnet" # *
9090
key_pair = "my_key" # *
9191
control_node_flavor = "some_flavor_name"
92-
login_nodes = {
93-
login-0: "login_flavor_name"
92+
login = {
93+
# Arbitrary group name for these login nodes
94+
interactive = {
95+
nodes: ["login-0"]
96+
flavor: "login_flavor_name" # *
97+
}
9498
}
9599
cluster_image_id = "rocky_linux_9_image_uuid"
96100
compute = {
101+
# Group name used for compute node partition definition
97102
general = {
98103
nodes: ["compute-0", "compute-1"]
99-
flavor: "compute_flavor_name"
104+
flavor: "compute_flavor_name" # *
100105
}
101106
}
102107

ansible/bootstrap.yml

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -127,21 +127,35 @@
127127
that: dnf_repos_password is undefined
128128
fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password'
129129
when: appliances_mode == 'configure'
130-
- name: Replace system repos with pulp repos
131-
ansible.builtin.include_role:
132-
name: dnf_repos
133-
tasks_from: set_repos.yml
134130

135-
# --- tasks after here require access to package repos ---
136131
- hosts: squid
137132
tags: squid
138133
gather_facts: yes
139134
become: yes
140135
tasks:
136+
# - Installing squid requires working dnf repos
137+
# - Configuring dnf_repos itself requires working dnf repos to install epel
138+
# - Hence do this on squid nodes first in case they are proxying others
139+
- name: Replace system repos with pulp repos
140+
ansible.builtin.include_role:
141+
name: dnf_repos
142+
tasks_from: set_repos.yml
143+
when: "'dnf_repos' in group_names"
141144
- name: Configure squid proxy
142145
import_role:
143146
name: squid
144147

148+
- hosts: dnf_repos
149+
tags: dnf_repos
150+
gather_facts: yes
151+
become: yes
152+
tasks:
153+
- name: Replace system repos with pulp repos
154+
ansible.builtin.include_role:
155+
name: dnf_repos
156+
tasks_from: set_repos.yml
157+
158+
# --- tasks after here require general access to package repos ---
145159
- hosts: tuned
146160
tags: tuned
147161
gather_facts: yes

ansible/ci/check_slurm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: node_name,partition,partition_state,max_jobtime,num_nodes,node_state
77
register: sinfo
88
changed_when: false
9-
until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout)
9+
until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout or "down" in sinfo.stdout)
1010
retries: 10
1111
delay: 5
1212
- name: Check nodes have expected slurm state

docs/experimental/compute-init.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@
22

33
See the role README.md
44

5+
# Changes to image / tofu state
6+
7+
When a compute group has the `ignore_image_changes` parameter set to true,
8+
changes to the `image_id` parameter (which defaults to `cluster_image_id`) are
9+
ignored by OpenTofu.
10+
11+
Regardless of whether `ignore_image_changes` is set, OpenTofu templates out the
12+
`image_id` into the Ansible inventory for each compute node. The `compute_init`
13+
role templates out hostvars to the control node, which means the "target" image
14+
ID is then available on the control node. Subsequent work will use this to
15+
rebuild the node via Slurm.
16+
517
# CI workflow
618

719
The compute node rebuild is tested in CI after the tests for rebuilding the
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-250115-1510-99f67c6d",
4-
"RL9": "openhpc-RL9-250115-1510-99f67c6d"
3+
"RL8": "openhpc-RL8-250122-1150-a0899ef8",
4+
"RL9": "openhpc-RL9-250122-1150-a0899ef8"
55
}
66
}

environments/.stackhpc/tofu/main.tf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,14 +73,18 @@ module "cluster" {
7373
# are not in the same environment for stackhpc
7474
inventory_secrets_path = "${path.module}/../inventory/group_vars/all/secrets.yml"
7575

76-
login_nodes = {
77-
login-0: var.other_node_flavor
76+
login = {
77+
login: {
78+
nodes: ["login-0"]
79+
flavor: var.other_node_flavor
80+
}
7881
}
7982
compute = {
8083
standard: { # NB: can't call this default!
8184
nodes: ["compute-0", "compute-1"]
8285
flavor: var.other_node_flavor
8386
compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"]
87+
# ignore_image_changes: true
8488
}
8589
# Example of how to add another partition:
8690
# extra: {

environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
module "compute" {
2-
source = "./compute"
2+
source = "./node_group"
33

44
for_each = var.compute
55

@@ -21,6 +21,7 @@ module "compute" {
2121
extra_volumes = lookup(each.value, "extra_volumes", {})
2222

2323
compute_init_enable = lookup(each.value, "compute_init_enable", [])
24+
ignore_image_changes = lookup(each.value, "ignore_image_changes", false)
2425

2526
key_pair = var.key_pair
2627
environment_root = var.environment_root

environments/skeleton/{{cookiecutter.environment}}/tofu/nodes.tf renamed to environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf

Lines changed: 0 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,6 @@ locals {
22
control_volumes = concat([openstack_blockstorage_volume_v3.state], var.home_volume_size > 0 ? [openstack_blockstorage_volume_v3.home][0] : [])
33
}
44

5-
resource "openstack_networking_port_v2" "login" {
6-
7-
for_each = var.login_nodes
8-
9-
name = "${var.cluster_name}-${each.key}"
10-
network_id = data.openstack_networking_network_v2.cluster_net.id
11-
admin_state_up = "true"
12-
13-
fixed_ip {
14-
subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id
15-
}
16-
17-
security_group_ids = [for o in data.openstack_networking_secgroup_v2.login: o.id]
18-
19-
binding {
20-
vnic_type = var.vnic_type
21-
profile = var.vnic_profile
22-
}
23-
}
24-
255
resource "openstack_networking_port_v2" "control" {
266

277
name = "${var.cluster_name}-control"
@@ -96,42 +76,3 @@ resource "openstack_compute_instance_v2" "control" {
9676
EOF
9777

9878
}
99-
100-
resource "openstack_compute_instance_v2" "login" {
101-
102-
for_each = var.login_nodes
103-
104-
name = "${var.cluster_name}-${each.key}"
105-
image_id = var.cluster_image_id
106-
flavor_name = each.value
107-
key_pair = var.key_pair
108-
109-
dynamic "block_device" {
110-
for_each = var.volume_backed_instances ? [1]: []
111-
content {
112-
uuid = var.cluster_image_id
113-
source_type = "image"
114-
destination_type = "volume"
115-
volume_size = var.root_volume_size
116-
boot_index = 0
117-
delete_on_termination = true
118-
}
119-
}
120-
121-
network {
122-
port = openstack_networking_port_v2.login[each.key].id
123-
access_network = true
124-
}
125-
126-
metadata = {
127-
environment_root = var.environment_root
128-
k3s_token = local.k3s_token
129-
control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
130-
}
131-
132-
user_data = <<-EOF
133-
#cloud-config
134-
fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix}
135-
EOF
136-
137-
}

environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ resource "local_file" "hosts" {
44
"cluster_name": var.cluster_name,
55
"cluster_domain_suffix": var.cluster_domain_suffix,
66
"control_instances": openstack_compute_instance_v2.control
7-
"login_instances": openstack_compute_instance_v2.login
7+
"login_groups": module.login
88
"compute_groups": module.compute
99
"state_dir": var.state_dir
1010
},

environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,22 @@ control:
1313
vars:
1414
appliances_state_dir: ${state_dir} # NB needs to be set on group not host otherwise it is ignored in packer build!
1515

16-
login:
16+
17+
%{ for group_name in keys(login_groups) ~}
18+
${cluster_name}_${group_name}:
1719
hosts:
18-
%{ for login in login_instances ~}
19-
${ login.name }:
20-
ansible_host: ${[for n in login.network: n.fixed_ip_v4 if n.access_network][0]}
21-
instance_id: ${ login.id }
20+
%{ for node in login_groups[group_name]["compute_instances"] ~}
21+
${ node.name }:
22+
ansible_host: ${node.access_ip_v4}
23+
instance_id: ${ node.id }
24+
image_id: ${ node.image_id }
25+
%{ endfor ~}
26+
%{ endfor ~}
27+
28+
login:
29+
children:
30+
%{ for group_name in keys(login_groups) ~}
31+
${cluster_name}_${group_name}:
2232
%{ endfor ~}
2333

2434
%{ for group_name in keys(compute_groups) ~}
@@ -28,6 +38,7 @@ ${cluster_name}_${group_name}:
2838
${ node.name }:
2939
ansible_host: ${node.access_ip_v4}
3040
instance_id: ${ node.id }
41+
image_id: ${ node.image_id }
3142
%{ endfor ~}
3243
%{ endfor ~}
3344

0 commit comments

Comments
 (0)