1+ #!/usr/bin/env python3
2+ import json
3+ import os
4+ import argparse
5+ import subprocess
6+ import sys
7+
def write_ordered_hostfile(ordered_hosts=None, hostfile=None, srun=False, slots_per_host=8):
    """Write hostnames to ``hostfile``, one per line, in the order given.

    Args:
        ordered_hosts: Hostnames to write, already in the desired order.
            ``None`` is treated as an empty list.
        hostfile: Path of the file to create. An existing regular file at
            that path is removed first.
        srun: When true, each hostname is written ``slots_per_host`` times
            on consecutive lines instead of once.
        slots_per_host: Repetition count used when ``srun`` is true.
            Defaults to 8, the value that used to be hard-coded.
    """
    repeat = slots_per_host if srun else 1
    if os.path.isfile(hostfile):
        os.remove(hostfile)
    # The context manager closes the file even if a write fails.
    with open(hostfile, "w") as fhandler:
        for host in ordered_hosts or []:
            # Terminate with a bare newline: a trailing space here would
            # become a leading space on the next hostname.
            fhandler.write((host + "\n") * repeat)
20+
def write_ordered_rankfile(ordered_hosts=None, hostfile=None, gpus_per_node=None):
    """Write a rankfile with one ``rank <n>=<host> slot=<gpu>`` line per GPU.

    Ranks are numbered consecutively: the host at position ``index`` gets
    ranks ``index * gpus_per_node`` through
    ``index * gpus_per_node + gpus_per_node - 1``, mapped to slots
    ``0 .. gpus_per_node - 1``.

    Args:
        ordered_hosts: Hostnames in rank order. ``None`` is treated as an
            empty list.
        hostfile: Path of the rankfile to create. An existing regular file
            at that path is removed first.
        gpus_per_node: Number of slots per host. When ``None`` the
            module-level ``gpus`` value is used, as before.
    """
    if gpus_per_node is None:
        gpus_per_node = gpus
    if os.path.isfile(hostfile):
        os.remove(hostfile)
    # The context manager closes the file even if a write fails.
    with open(hostfile, "w") as fhandler:
        for index, host in enumerate(ordered_hosts or []):
            for gpu_index in range(gpus_per_node):
                rank = index * gpus_per_node + gpu_index
                # Bare newline terminator: a trailing space would prefix
                # the next "rank" line with whitespace.
                fhandler.write("rank " + str(rank) + "=" + host + " slot=" + str(gpu_index) + "\n")
29+
30+
def get_swicthname(host):
    """Return the name of the ``Level=0`` switch that scontrol reports for ``host``.

    Runs ``scontrol show topology <host>`` and looks at the first output line
    containing ``Level=0``. The first space-separated field of that line is
    returned with its ``SwitchName=`` prefix removed.

    Args:
        host: Hostname passed to ``scontrol`` as a single argument.

    Returns:
        The switch name, or an empty string when no ``Level=0`` line is
        printed.

    Exits the process with status 1 if ``scontrol`` cannot be started.
    """
    try:
        # Argument list instead of a shell string: the hostname comes from a
        # user-supplied file and must not be interpreted by a shell.
        result = subprocess.run(
            ["scontrol", "show", "topology", host],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"Error grabbing switchname: {e} ")
        sys.exit(1)

    # Equivalent of "| grep Level=0" followed by taking the first field.
    for line in result.stdout.splitlines():
        if "Level=0" in line:
            return line.split(" ")[0].replace("SwitchName=", "")
    return ""
40+
# Number of GPU slots per node; read by write_ordered_rankfile when it
# numbers ranks and slots.
gpus = 8
parser = argparse.ArgumentParser(description='Script to order hostnames for optimal performance based on rack Id')
parser.add_argument('--input_file', help='Path of the input file which has host names. One hostname on each line in the file')
args = parser.parse_args()

if args.input_file is None:
    # Fail loudly: exiting silently with status 0 makes a missing argument
    # look like a successful run. parser.error prints usage and exits 2.
    parser.error("--input_file is required")
input_file = args.input_file

with open(input_file, 'r') as f:
    # Drop blank lines and surrounding whitespace so that a trailing newline
    # or padded entry does not become a bogus host.
    hosts = [line.strip() for line in f if line.strip()]
57+
58+
# Maps a switch name (Slurm path) or rack identifier (instance-metadata path)
# to the list of hosts found under it, in input order.
r = {}
# Maps each name from the input file to the hostname used in the
# "system name" output files.
friendly_name_to_system_hostname = {}
for i in hosts:
    print(i)

# Slurm topology is used when `scontrol show topology` can be run and prints
# nothing on stderr.
topology_command = ["scontrol", "show", "topology"]
try:
    result = subprocess.run(topology_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    slurm = result.stderr == ""
except OSError:
    # scontrol is missing or not executable.
    slurm = False

if slurm:
    for host in hosts:
        switch = get_swicthname(host)
        r.setdefault(switch, []).append(host)
        friendly_name_to_system_hostname[host] = host
else:
    # Only the import is guarded, so an ImportError raised while talking to
    # the hosts is not mistaken for "pssh is not installed".
    try:
        from pssh.clients import ParallelSSHClient
    except ImportError:
        ParallelSSHClient = None

    if ParallelSSHClient is not None:
        client = ParallelSSHClient(hosts, port=2222)
        output = client.run_command('curl http://169.254.169.254/opc/v1/host/')
        for host_out in output:
            j = json.loads(''.join(host_out.stdout))
            # Prefer the RDMA local block when the metadata has it; fall
            # back to the rack id otherwise.
            try:
                rackID = j['rdmaTopologyData']['customerLocalBlock']
            except (KeyError, TypeError):
                rackID = j['rackId']
            r.setdefault(rackID, []).append(host_out.host)
        hostname_output = client.run_command('/usr/bin/hostname')
        for host_out in hostname_output:
            friendly_name_to_system_hostname[host_out.host] = ''.join(host_out.stdout)
    else:
        # Plain ssh fallback, one host at a time.
        # NOTE(review): unlike the pssh path above, this fallback groups by
        # "rackId" only and ignores rdmaTopologyData.customerLocalBlock;
        # confirm whether that difference is intentional.
        try:
            for h in hosts:
                # stderr is captured separately so ssh warnings cannot end up
                # inside the JSON document being parsed.
                out = subprocess.run(["ssh", h, "curl -s http://169.254.169.254/opc/v1/host/"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, check=True)
                json_data = json.loads(''.join(out.stdout.splitlines()))
                rackId = json_data.get("rackId", None)
                r.setdefault(rackId, []).append(h)
            for h in hosts:
                out = subprocess.run(["ssh", h, "/usr/bin/hostname"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, check=True)
                # First stdout line is the hostname reported by the node.
                friendly_name_to_system_hostname[h] = out.stdout.splitlines()[0]
        except subprocess.CalledProcessError as e_process_error:
            sys.exit(f"Error code: {e_process_error.returncode} Output: {e_process_error.stderr or e_process_error.output} ")
119+
120+
ordered_hosts = []
ordered_hosts_friendly_name = []
# Sort racks by number of hosts, largest first.
racks_sorted = sorted(r.items(), key=lambda x: len(x[1]), reverse=True)
# The context manager closes node_switch_list even if a lookup below fails.
with open("node_switch_list", "w") as fhandler:
    # Racks are numbered from 1 in sorted order; the original rack/switch
    # key is not written out.
    for i, (_, rack_hosts) in enumerate(racks_sorted, start=1):
        print(f'# rack {i} ')
        for h in rack_hosts:
            # Bare newline terminator so the next line does not start with
            # a stray space.
            fhandler.write("Node " + h + " from switch number " + str(i) + "\n")
            print(h)
            ordered_hosts.append(h)
            ordered_hosts_friendly_name.append(friendly_name_to_system_hostname[h])

# Names exactly as given in the input file.
hostfile = "ordered_hostfile"
write_ordered_hostfile(ordered_hosts, hostfile)
# Same order, using the mapped system hostnames.
hostfile = "ordered_hostfile_system_name"
write_ordered_hostfile(ordered_hosts_friendly_name, hostfile)
# System hostnames with srun mode enabled (each name repeated).
hostfile = "ordered_hostfile_system_name_srun"
write_ordered_hostfile(ordered_hosts_friendly_name, hostfile, True)
rankfile = "rankfile_system_name"
write_ordered_rankfile(ordered_hosts_friendly_name, rankfile)
0 commit comments