@@ -111,12 +111,12 @@ def getDefaultsConfig(config,queue_name):
111111 for instance_type in partition ["instance_types" ]:
112112 if "default" in instance_type .keys ():
113113 if instance_type ["default" ]:
114- return {"queue" :partition ["name" ], "instance_type" :instance_type ["name" ], "shape" :instance_type ["shape" ], "cluster_network" :instance_type ["cluster_network" ], "instance_keyword " :instance_type ["instance_keyword " ]}
114+ return {"queue" :partition ["name" ], "instance_type" :instance_type ["name" ], "shape" :instance_type ["shape" ], "cluster_network" :instance_type ["cluster_network" ], "hostname_convention " :instance_type ["hostname_convention " ]}
115115 if len (partition ["instance_types" ])> 0 :
116116 instance_type = partition ["instance_types" ][0 ]
117117 print ("No default configuration was found, there may be a problem in your queues.conf file" )
118118 print ("Selecting " + instance_type ["name" ]+ " as default" )
119- return {"queue" :partition ["name" ], "instance_type" :instance_type ["name" ], "shape" :instance_type ["shape" ], "cluster_network" :instance_type ["cluster_network" ], "instance_keyword " :instance_type ["instance_keyword " ]}
119+ return {"queue" :partition ["name" ], "instance_type" :instance_type ["name" ], "shape" :instance_type ["shape" ], "cluster_network" :instance_type ["cluster_network" ], "hostname_convention " :instance_type ["hostname_convention " ]}
120120 print ("The queue " + queue_name + " was not found in the queues.conf file" )
121121 return None
122122
@@ -125,7 +125,7 @@ def getJobConfig(config,queue_name,instance_type_name):
125125 if queue_name == partition ["name" ]:
126126 for instance_type in partition ["instance_types" ]:
127127 if instance_type_name == instance_type ["name" ]:
128- return {"queue" :partition ["name" ], "instance_type" :instance_type ["name" ], "shape" :instance_type ["shape" ], "cluster_network" :instance_type ["cluster_network" ], "instance_keyword " :instance_type ["instance_keyword " ]}
128+ return {"queue" :partition ["name" ], "instance_type" :instance_type ["name" ], "shape" :instance_type ["shape" ], "cluster_network" :instance_type ["cluster_network" ], "hostname_convention " :instance_type ["hostname_convention " ]}
129129 return None
130130
131131def getQueueLimits (config ,queue_name ,instance_type_name ):
@@ -136,11 +136,11 @@ def getQueueLimits(config,queue_name,instance_type_name):
136136 return {"max_number_nodes" : int (instance_type ["max_number_nodes" ]), "max_cluster_size" : int (instance_type ["max_cluster_size" ]),"max_cluster_count" : int (instance_type ["max_cluster_count" ])}
137137 return {"max_number_nodes" : 0 , "max_cluster_size" : 0 ,"max_cluster_count" : 0 }
138138
def getInstanceType(config, queue_name, hostname_convention):
    """Return the name of the instance type in a queue that uses a hostname convention.

    Args:
        config: Iterable of partition dicts. Each partition is read for its
            "name" and for "instance_types", a list of instance-type dicts.
        queue_name: Name of the partition (queue) to search.
        hostname_convention: Value compared for equality against each
            instance type's "hostname_convention" entry.

    Returns:
        The "name" of the first matching instance type in the first partition
        whose name equals ``queue_name``, or None when nothing matches.
    """
    for partition in config:
        if queue_name != partition["name"]:
            continue
        for instance_type in partition["instance_types"]:
            # An entry without the key cannot match; skip it instead of
            # raising KeyError and aborting the whole lookup.
            if "hostname_convention" not in instance_type:
                continue
            if hostname_convention == instance_type["hostname_convention"]:
                return instance_type["name"]
    return None
146146
@@ -161,26 +161,33 @@ def getAllClusterNames(config):
161161 return availableNames
162162
def _switchNameFromTopology(topology_output):
    """Extract a switch name from the text printed by ``scontrol show topology <node>``.

    Returns the text following ``SwitchName=`` in the first whitespace-separated
    token of the selected line, or None when no line yields a name. Raises
    IndexError when a line is empty or its first token has no ``SwitchName=``.
    """
    lines = topology_output.split('\n')
    clusterName = None
    if len(lines) > 2:
        # The last element is whatever follows the final newline; it is
        # not inspected.
        for output in lines[:-1]:
            if "Switches=" in output:
                # A line listing child switches wins immediately.
                clusterName = output.split()[0].split('SwitchName=')[1]
                break
            if "SwitchName=inactive-" in output:
                continue
            # Otherwise remember the latest non-inactive switch seen.
            clusterName = output.split()[0].split('SwitchName=')[1]
    elif len(lines) == 2:
        clusterName = lines[0].split()[0].split('SwitchName=')[1]
    return clusterName


def getClusterName(node):
    """Return the cluster name for a Slurm node, or "NOCLUSTERFOUND".

    The name is first looked up in the node details: the first element
    returned by ``getNodeDetails(node)`` is split on commas and an entry
    prefixed with ``CN__`` supplies the name (the last such entry wins).
    NOTE(review): presumably that element is the node's feature list;
    confirm against getNodeDetails.

    When no ``CN__`` entry provides a name, the switch name reported by
    ``scontrol show topology <node>`` is used instead.

    Args:
        node: Node name passed to getNodeDetails and to scontrol.

    Returns:
        The cluster name, or "NOCLUSTERFOUND" when the topology output gives
        no usable switch name or the switch name starts with "inactive-".
    """
    details = getNodeDetails(node)
    clusterName = "NOCLUSTERFOUND"
    for feature in details[0].split(","):
        if feature.startswith('CN__'):
            clusterName = feature[4:]
    if clusterName != "NOCLUSTERFOUND":
        return clusterName

    out = subprocess.Popen(['scontrol', 'show', 'topology', node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
    stdout, _ = out.communicate()
    try:
        clusterName = _switchNameFromTopology(stdout)
    except IndexError:
        # Malformed topology line; catching only IndexError lets
        # KeyboardInterrupt and SystemExit propagate.
        clusterName = None
    if clusterName is None:
        print('No ClusterName could be found for ' + node)
        print('There seems to be some issues in the slurm topology file')
        return "NOCLUSTERFOUND"
    if clusterName.startswith("inactive-"):
        return "NOCLUSTERFOUND"
    return clusterName
185192
186193def getstatus_slurm ():
@@ -246,7 +253,7 @@ def getstatus_slurm():
246253 clustername = getClusterName (node )
247254 if clustername is None :
248255 continue
249- instanceType = features [- 1 ]
256+ instanceType = features [0 ]
250257 if queue in current_nodes .keys ():
251258 if instanceType in current_nodes [queue ].keys ():
252259 current_nodes [queue ][instanceType ]+= 1
@@ -276,7 +283,9 @@ def getstatus_slurm():
276283 cluster_to_destroy = []
277284 for clustername in nodes_to_destroy_temp .keys ():
278285 destroyEntireCluster = True
279- if clustername in running_cluster or clustername == "NOCLUSTERFOUND" :
286+ if clustername == "NOCLUSTERFOUND" :
287+ destroyEntireCluster = False
288+ elif clustername in running_cluster :
280289 nodes_to_destroy [clustername ]= nodes_to_destroy_temp [clustername ]
281290 destroyEntireCluster = False
282291 else :
@@ -295,10 +304,10 @@ def getstatus_slurm():
295304 for clusterName in os .listdir (clusters_path ):
296305 if len (clusterName .split ('-' )) < 3 :
297306 continue
298- instance_keyword = '-' .join (clusterName .split ('-' )[2 :])
307+ hostname_convention = '-' .join (clusterName .split ('-' )[2 :])
299308 clusterNumber = int (clusterName .split ('-' )[1 ])
300309 queue = clusterName .split ('-' )[0 ]
301- instanceType = getInstanceType (config ,queue ,instance_keyword )
310+ instanceType = getInstanceType (config ,queue ,hostname_convention )
302311 if not queue in used_index .keys ():
303312 used_index [queue ]= {}
304313 if not instanceType in used_index [queue ].keys ():
@@ -311,19 +320,19 @@ def getstatus_slurm():
311320 nodes = line .split ()[0 ]
312321 instance_type = line .split ()[1 ]
313322 queue = line .split ()[2 ]
314- try :
315- cluster_building .append ([int (nodes ),instance_type ,queue ])
316- if queue in building_nodes .keys ():
317- if instance_type in building_nodes [queue ].keys ():
318- building_nodes [queue ][instance_type ]+= int (nodes )
323+ try :
324+ cluster_building .append ([int (nodes ),instance_type ,queue ])
325+ if queue in building_nodes .keys ():
326+ if instance_type in building_nodes [queue ].keys ():
327+ building_nodes [queue ][instance_type ]+= int (nodes )
328+ else :
329+ building_nodes [queue ][instance_type ]= int (nodes )
319330 else :
320- building_nodes [queue ][instance_type ]= int (nodes )
321- else :
322- building_nodes [queue ]= {instance_type :int (nodes )}
323- except ValueError :
324- print ('The cluster ' + clusterName + ' does not have a valid entry for \" currently_building\" ' )
325- print ('Ignoring' )
326- continue
331+ building_nodes [queue ]= {instance_type :int (nodes )}
332+ except ValueError :
333+ print ('The cluster ' + clusterName + ' does not have a valid entry for \" currently_building\" ' )
334+ print ('Ignoring' )
335+ continue
327336 if os .path .isfile (os .path .join (clusters_path ,clusterName ,'currently_destroying' )):
328337 cluster_destroying .append (clusterName )
329338 return cluster_to_build ,cluster_to_destroy ,nodes_to_destroy ,cluster_building ,cluster_destroying ,used_index ,current_nodes ,building_nodes
@@ -422,7 +431,7 @@ if autoscaling == "true":
422431 nextIndex = i
423432 used_index [queue ][instance_type ].append (i )
424433 break
425- clusterName = queue + '-' + str (nextIndex )+ '-' + jobconfig ["instance_keyword " ]
434+ clusterName = queue + '-' + str (nextIndex )+ '-' + jobconfig ["hostname_convention " ]
426435 if not queue in current_nodes .keys ():
427436 current_nodes [queue ]= {instance_type :0 }
428437 else :
@@ -448,5 +457,5 @@ if autoscaling == "true":
448457 traceback .print_exc ()
449458 os .remove (lockfile )
450459else :
451- print ("Autoscaling is false" )
460+ print ("Autoscaling is false (set in /etc/ansible/hosts) " )
452461 exit ()
0 commit comments