Skip to content

Commit 53a7dc4

Browse files
committed
get resolv_conf, etc_hosts and stackhpc.openhpc working
1 parent cb21e9c commit 53a7dc4

File tree

1 file changed

+88
-9
lines changed

1 file changed

+88
-9
lines changed

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 88 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,9 @@
66
vars:
77
os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
88
server_node_ip: "{{ os_metadata.meta.k3s_server }}"
9-
compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}"
9+
enable_slurmd: "{{ os_metadata.meta.enable_slurmd | default(false) | bool }}"
10+
enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}"  # opt-in: false unless this metadata key is set
11+
enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}"
1012

1113
# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
1214
# this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty
@@ -17,10 +19,10 @@
1719
- name: Report skipping initialization if not compute node
1820
# meta: end_play produces no output
1921
debug:
20-
msg: "Skipping compute initialization as metadata compute_groups is empty"
22+
msg: "Skipping compute initialization: Metadata enable_slurmd is not true"
2123

2224
- meta: end_play
23-
when: compute_groups | length == 0
25+
when: not enable_slurmd
2426

2527
- name: Ensure the mount directory exists
2628
file:
@@ -37,16 +39,15 @@
3739
fstype: nfs
3840
opts: ro,sync
3941
state: mounted
40-
register: nfs_mount_result
41-
ignore_errors: true
4242
register: _mount_mnt_cluster
43+
ignore_errors: true
4344
# TODO: add some retries here?
4445

4546
- block:
4647
- name: Report skipping initialization if cannot mount nfs
4748
# meta: end_play produces no output
4849
debug:
49-
msg: "Skipping compute initialization as cannot mount exports/cluster share"
50+
msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}"
5051

5152
- meta: end_play
5253
when: _mount_mnt_cluster.failed
@@ -56,6 +57,84 @@
5657
include_vars:
5758
file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname
5859

59-
- name: Demonstrate hostvars have loaded
60-
debug:
61-
var: prometheus_version
60+
# TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups?
61+
62+
- name: Configure resolv.conf  # block is gated on enable_resolv_conf
63+
block:
64+
- name: Set nameservers in /etc/resolv.conf
65+
ansible.builtin.template:
66+
src: /etc/ansible-init/templates/resolv.conf.j2
67+
dest: /etc/resolv.conf
68+
owner: root
69+
group: root
70+
mode: u=rw,og=r
71+
72+
- name: Disable NetworkManager control of resolv.conf
73+
ansible.builtin.copy:
74+
src: /etc/ansible-init/files/NetworkManager-dns-none.conf
75+
dest: /etc/NetworkManager/conf.d/90-dns-none.conf
76+
owner: root
77+
group: root
78+
mode: u=rw,og=r
79+
register: _copy_nm_config
80+
81+
- name: Reload NetworkManager
82+
ansible.builtin.systemd:
83+
name: NetworkManager
84+
state: reloaded
85+
when: _copy_nm_config.changed | default(false)
86+
when: enable_resolv_conf
87+
88+
- name: Copy cluster /etc/hosts
89+
copy:
90+
src: /mnt/cluster/hosts
91+
dest: /etc/hosts
92+
owner: root
93+
group: root
94+
mode: "0644"  # quoted so the YAML parser cannot retype the octal permission
95+
when: enable_etc_hosts
96+
97+
# TODO: - name: NFS client mount
98+
99+
# TODO: - name: Manila mount
100+
101+
# TODO: - name: Basic users setup
102+
103+
# TODO: - name: Configure EESSI
104+
105+
# TODO: - name: Configure openhpc
106+
# NB: don't need conditional block on enable_slurmd as have already exited
107+
# if not the case
108+
- name: Write Munge key
109+
copy:
110+
content: "{{ openhpc_munge_key }}"
111+
dest: "/etc/munge/munge.key"
112+
owner: munge
113+
group: munge
114+
mode: "0400"  # quoted so the YAML parser cannot retype the octal permission
115+
116+
- name: Set slurmctld location for configless operation
117+
lineinfile:
118+
path: /etc/sysconfig/slurmd
119+
line: "SLURMD_OPTIONS='--conf-server {{ server_node_ip }}'"
120+
regexp: "^SLURMD_OPTIONS="
121+
create: true  # create /etc/sysconfig/slurmd if it does not exist yet
122+
owner: root
123+
group: root
124+
mode: "0644"  # quoted so the YAML parser cannot retype the octal permission
125+
126+
- name: Ensure Munge service state
127+
service:
128+
name: munge
129+
enabled: true
130+
state: started
131+
132+
- name: Ensure slurmd service state
133+
service:
134+
name: slurmd
135+
enabled: true
136+
state: started
137+
138+
- name: Ensure node is resumed
139+
# TODO: consider if this is always safe for all job states?
140+
command: scontrol update state=resume nodename={{ ansible_hostname }}

0 commit comments

Comments
 (0)