|
6 | 6 | vars:
|
7 | 7 | os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
|
8 | 8 | server_node_ip: "{{ os_metadata.meta.k3s_server }}"
|
9 |
| - compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}" |
| 9 | + enable_slurmd: "{{ os_metadata.meta.enable_slurmd | default(false) | bool }}" |
| 10 | + enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" |
| 11 | + enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" |
10 | 12 |
|
11 | 13 | # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
|
12 | 14 | # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty
|
|
17 | 19 | - name: Report skipping initialization if not compute node
|
18 | 20 | # meta: end_play produces no output
|
19 | 21 | debug:
|
20 |
| - msg: "Skipping compute initialization as metadata compute_groups is empty" |
| 22 | + msg: "Skipping compute initialization: Metadata enable_slurmd is not true" |
21 | 23 |
|
22 | 24 | - meta: end_play
|
23 |
| - when: compute_groups | length == 0 |
| 25 | + when: not enable_slurmd |
24 | 26 |
|
25 | 27 | - name: Ensure the mount directory exists
|
26 | 28 | file:
|
|
37 | 39 | fstype: nfs
|
38 | 40 | opts: ro,sync
|
39 | 41 | state: mounted
|
40 |
| - register: nfs_mount_result |
41 |
| - ignore_errors: true |
42 | 42 | register: _mount_mnt_cluster
|
| 43 | + ignore_errors: true |
43 | 44 | # TODO: add some retries here?
|
44 | 45 |
|
45 | 46 | - block:
|
46 | 47 | - name: Report skipping initialization if cannot mount nfs
|
47 | 48 | # meta: end_play produces no output
|
48 | 49 | debug:
|
49 |
| - msg: "Skipping compute initialization as cannot mount exports/cluster share" |
| 50 | + msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}" |
50 | 51 |
|
51 | 52 | - meta: end_play
|
52 | 53 | when: _mount_mnt_cluster.failed
|
|
56 | 57 | include_vars:
|
57 | 58 | file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname
|
58 | 59 |
|
59 |
| - - name: Demonstrate hostvars have loaded |
60 |
| - debug: |
61 |
| - var: prometheus_version |
| 60 | + # TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups? |
| 61 | + |
| 62 | + - name: Configure resolv.conf |
| 63 | + block: |
| 64 | + - name: Set nameservers in /etc/resolv.conf |
| 65 | + ansible.builtin.template: |
| 66 | + src: /etc/ansible-init/templates/resolv.conf.j2 |
| 67 | + dest: /etc/resolv.conf |
| 68 | + owner: root |
| 69 | + group: root |
| 70 | + mode: u=rw,og=r |
| 71 | + |
| 72 | + - name: Disable NetworkManager control of resolv.conf |
| 73 | + ansible.builtin.copy: |
| 74 | + src: /etc/ansible-init/files/NetworkManager-dns-none.conf |
| 75 | + dest: /etc/NetworkManager/conf.d/90-dns-none.conf |
| 76 | + owner: root |
| 77 | + group: root |
| 78 | + mode: u=rw,og=r |
| 79 | + register: _copy_nm_config |
| 80 | + |
| 81 | + - name: Reload NetworkManager |
| 82 | + ansible.builtin.systemd: |
| 83 | + name: NetworkManager |
| 84 | + state: reloaded |
| 85 | + when: _copy_nm_config.changed | default(false) |
| 86 | + when: enable_resolv_conf |
| 87 | + |
| 88 | + - name: Copy cluster /etc/hosts |
| 89 | + copy: |
| 90 | + src: /mnt/cluster/hosts |
| 91 | + dest: /etc/hosts |
| 92 | + owner: root |
| 93 | + group: root |
| 94 | + mode: "0644" |
| 95 | + when: enable_etc_hosts |
| 96 | + |
| 97 | + # TODO: - name: NFS client mount |
| 98 | + |
| 99 | + # TODO: - name: Manila mount |
| 100 | + |
| 101 | + # TODO: - name: Basic users setup |
| 102 | + |
| 103 | + # TODO: - name: Configure EESSI |
| 104 | + |
| 105 | + # TODO: - name: Configure openhpc |
| 106 | + # NB: don't need conditional block on enable_slurmd as have already exited |
| 107 | + # if not the case |
| 108 | + - name: Write Munge key |
| 109 | + copy: |
| 110 | + content: "{{ openhpc_munge_key }}" |
| 111 | + dest: "/etc/munge/munge.key" |
| 112 | + owner: munge |
| 113 | + group: munge |
| 114 | + mode: "0400" |
| 115 | + |
| 116 | + - name: Set slurmctld location for configless operation |
| 117 | + lineinfile: |
| 118 | + path: /etc/sysconfig/slurmd |
| 119 | + line: "SLURMD_OPTIONS='--conf-server {{ server_node_ip }}'" |
| 120 | + regexp: "^SLURMD_OPTIONS=" |
| 121 | + create: true |
| 122 | + owner: root |
| 123 | + group: root |
| 124 | + mode: "0644" |
| 125 | + |
| 126 | + - name: Ensure Munge service state |
| 127 | + service: |
| 128 | + name: munge |
| 129 | + enabled: true |
| 130 | + state: started |
| 131 | + |
| 132 | + - name: Ensure slurmd service state |
| 133 | + service: |
| 134 | + name: slurmd |
| 135 | + enabled: true |
| 136 | + state: started |
| 137 | + |
| 138 | + - name: Ensure node is resumed |
| 139 | + # TODO: consider if this is always safe for all job states? |
| 140 | + command: scontrol update state=resume nodename={{ ansible_hostname }} |
0 commit comments