Skip to content

Commit 8ac8af2

Browse files
Merge pull request #97 from oci-hpc/2.10.1_ds_updates
topology.conf in sync with bastion on backup
2 parents b195d49 + d26442c commit 8ac8af2

File tree

11 files changed

+109
-45
lines changed

11 files changed

+109
-45
lines changed

playbooks/destroy.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
- hosts: compute
1+
- hosts: compute, slurm_backup
22
become: true
33
vars:
44
destroy: true
@@ -9,7 +9,7 @@
99
- include_role:
1010
name: slurm
1111
when: slurm|default(false)|bool
12-
- hosts: bastion
12+
- hosts: bastion, slurm_backup, login
1313
become: true
1414
vars:
1515
destroy: true

playbooks/new_nodes.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@
173173
name: latency_check
174174
when: cluster_network|bool and not 'GPU' in shape
175175

176-
- hosts: compute
176+
- hosts: compute, slurm_backup
177177
vars:
178178
destroy: false
179179
initial: false
@@ -193,4 +193,4 @@
193193
when: slurm|default(false)|bool
194194
- include_role:
195195
name: telegraf
196-
when: monitoring|default(false)|bool
196+
when: monitoring|default(false)|bool

playbooks/resize_remove.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
- include_role:
1010
name: etc-hosts
1111

12-
- hosts: compute_to_destroy
12+
- hosts: compute_to_destroy, slurm_backup
1313
become: true
1414
vars:
1515
destroy: true

playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,18 @@
255255
delegate_to: 127.0.0.1
256256
when: ('bastion' in group_names)
257257

258+
- name: move topology.conf on backup servers
259+
become: true
260+
copy:
261+
dest: '{{ slurm_conf_path }}/topology.conf'
262+
src: '{{ slurm_conf_path }}/topology.conf'
263+
force: yes
264+
register: topology_copied
265+
until: topology_copied is not failed
266+
retries: 10
267+
delay: 5
268+
when: ('slurm_backup' in group_names)
269+
258270
- name: Reconfigure Slurm for topology
259271
become: true
260272
command: "scontrol reconfigure"

playbooks/roles/destroy_unreachable/tasks/slurm.yml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -145,6 +145,18 @@
145145
delegate_to: 127.0.0.1
146146
when: ('bastion' in group_names)
147147

148+
- name: move topology.conf on backup servers
149+
become: true
150+
copy:
151+
dest: '{{ slurm_conf_path }}/topology.conf'
152+
src: '{{ slurm_conf_path }}/topology.conf'
153+
force: yes
154+
register: topology_copied
155+
until: topology_copied is not failed
156+
retries: 10
157+
delay: 5
158+
when: ('slurm_backup' in group_names)
159+
148160
- name: Reconfigure Slurm for topology
149161
become: true
150162
command: "scontrol reconfigure"

playbooks/roles/rack-aware/files/node_ordering_by_rack.py

Lines changed: 39 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,8 @@
11
#!/usr/bin/env python3
2-
from pssh.clients import ParallelSSHClient
32
import json
4-
import sys, getopt
53
import os
64
import argparse
7-
from operator import itemgetter
8-
from collections import OrderedDict
5+
import subprocess
96

107
def write_ordered_hostfile(ordered_hosts=[],hostfile=None):
118
#ordered_hostfile="ordered_hostfile"
@@ -43,28 +40,47 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None):
4340
#with open('/etc/opt/oci-hpc/hostfile.tcp', 'r') as f:
4441
hosts = f.read().splitlines()
4542

46-
client = ParallelSSHClient(hosts)
47-
output = client.run_command('curl http://169.254.169.254/opc/v1/host/')
48-
#print(output)
4943

5044
r = {}
51-
for host_out in output:
52-
j = json.loads(bytearray(''.join(list(host_out.stdout)).encode()))
53-
#print(j)
54-
if j['rackId'] in r:
55-
r[j['rackId']].append( host_out.host )
56-
else:
57-
r[j['rackId']] = [ host_out.host ]
58-
59-
6045
friendly_name_to_system_hostname = {}
61-
hostname_output = client.run_command('/usr/bin/hostname')
62-
#print(hostname_output)
63-
for host_out in hostname_output:
64-
#j = bytearray(''.join(list(host_out.stdout)).encode())
65-
j = bytearray(''.join(list(host_out.stdout)).encode())
66-
friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii')
67-
#print(j.decode(encoding='ascii')+" "+host_out.host)
46+
try:
47+
from pssh.clients import ParallelSSHClient
48+
client = ParallelSSHClient(hosts)
49+
output = client.run_command('curl http://169.254.169.254/opc/v1/host/')
50+
#print(output)
51+
for host_out in output:
52+
j = json.loads(bytearray(''.join(list(host_out.stdout)).encode()))
53+
#print(j)
54+
if j['rackId'] in r:
55+
r[j['rackId']].append( host_out.host )
56+
else:
57+
r[j['rackId']] = [ host_out.host ]
58+
hostname_output = client.run_command('/usr/bin/hostname')
59+
#print(hostname_output)
60+
for host_out in hostname_output:
61+
#j = bytearray(''.join(list(host_out.stdout)).encode())
62+
j = bytearray(''.join(list(host_out.stdout)).encode())
63+
friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii')
64+
#print(j.decode(encoding='ascii')+" "+host_out.host)
65+
except ImportError:
66+
try:
67+
for h in hosts:
68+
out = subprocess.run(["ssh "+h+" \"curl -s http://169.254.169.254/opc/v1/host/\""],stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, check=True)
69+
x = out.stdout.splitlines()
70+
del x[-1]
71+
del x[0]
72+
rackId_str = x[1].split(":")[1].replace('"','')
73+
rackId = rackId_str.replace(' ','')
74+
if rackId in r:
75+
r[rackId].append( h )
76+
else:
77+
r[rackId] = [ h ]
78+
for h in hosts:
79+
out = subprocess.run(["ssh "+h+" /usr/bin/hostname"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, check=True)
80+
x = out.stdout.splitlines()
81+
friendly_name_to_system_hostname[h] = x[0]
82+
except subprocess.CalledProcessError as e_process_error:
83+
exit(f"Error code: {e_process_error.returncode} Output: {e_process_error.output}")
6884

6985

7086
ordered_hosts = []

playbooks/roles/slurm/tasks/el7.yml

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -19,12 +19,6 @@
1919
include_tasks: server.yml
2020
when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool)
2121

22-
- name: run backup server directives
23-
vars:
24-
slurm_repos: "epel,ol7_developer_EPEL"
25-
include_tasks: backup_server.yml
26-
when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool)
27-
2822
- name: run compute directives
2923
vars:
3024
slurm_repos: "epel,ol7_developer_EPEL"
@@ -37,10 +31,20 @@
3731
include_tasks: login.yml
3832
when: ('login' in group_names) and (not destroy|bool) and (initial| bool)
3933

34+
- name: run backup server directives
35+
vars:
36+
slurm_repos: "epel,ol7_developer_EPEL"
37+
include_tasks: backup_server.yml
38+
when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool)
39+
4040
- name: cleanup
4141
include_tasks: cleanup.yml
4242
when: ('compute' in group_names) and (not destroy|bool)
4343

4444
- name: destroy
4545
include_tasks: destroy{{rack_aware_playbook_suffix}}.yml
4646
when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool)
47+
48+
- name: move topology.conf on backup slurm controller
49+
include_tasks: move-topology.yml
50+
when: ('slurm_backup' in group_names) and (not initial| bool)

playbooks/roles/slurm/tasks/el8.yml

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -19,12 +19,6 @@
1919
include_tasks: server.yml
2020
when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool)
2121

22-
- name: run backup server directives
23-
vars:
24-
slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder"
25-
include_tasks: backup_server.yml
26-
when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool)
27-
2822
- name: run compute directives
2923
vars:
3024
slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder"
@@ -37,10 +31,20 @@
3731
include_tasks: login.yml
3832
when: ('login' in group_names) and (not destroy|bool) and (initial| bool)
3933

34+
- name: run backup server directives
35+
vars:
36+
slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder"
37+
include_tasks: backup_server.yml
38+
when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool)
39+
4040
- name: cleanup
4141
include_tasks: cleanup.yml
4242
when: ('compute' in group_names) and (not destroy|bool)
4343

4444
- name: destroy
4545
include_tasks: destroy{{rack_aware_playbook_suffix}}.yml
4646
when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool)
47+
48+
- name: move topology.conf on backup slurm controller
49+
include_tasks: move-topology.yml
50+
when: ('slurm_backup' in group_names) and (not initial| bool)

playbooks/roles/slurm/tasks/move-topology.yml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
---
2+
3+
- name: move topology.conf on backup servers
4+
become: true
5+
copy:
6+
dest: '{{ slurm_conf_path }}/topology.conf'
7+
src: '{{ slurm_conf_path }}/topology.conf'
8+
force: yes
9+
register: topology_copied
10+
until: topology_copied is not failed
11+
retries: 10
12+
delay: 5

playbooks/roles/slurm/tasks/ubuntu.yml

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,17 +6,17 @@
66
include_tasks: server.yml
77
when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool)
88

9-
- name: run backup server directives
10-
include_tasks: backup_server.yml
11-
when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool)
12-
139
- name: run compute directives
1410
include_tasks: "compute{{rack_aware_playbook_suffix}}.yml"
1511
when: ('compute' in group_names) and (not destroy|bool)
1612

1713
- name: run login server directives
1814
include_tasks: login.yml
1915
when: ('login' in group_names) and (not destroy|bool) and (initial| bool)
16+
17+
- name: run backup server directives
18+
include_tasks: backup_server.yml
19+
when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool)
2020

2121
- name: cleanup
2222
include_tasks: cleanup.yml
@@ -25,3 +25,7 @@
2525
- name: destroy
2626
include_tasks: destroy{{rack_aware_playbook_suffix}}.yml
2727
when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool)
28+
29+
- name: move topology.conf on backup slurm controller
30+
include_tasks: move-topology.yml
31+
when: ('slurm_backup' in group_names) and (not initial| bool)

0 commit comments

Comments
 (0)